diff --git a/ansible.cfg b/ansible.cfg new file mode 100644 index 000000000..09c5b9fb9 --- /dev/null +++ b/ansible.cfg @@ -0,0 +1,19 @@ +# Only used for Azimuth running the caas environment +[defaults] +any_errors_fatal = True +gathering = smart +forks = 30 +host_key_checking = False +remote_tmp = /tmp +collections_path = ansible/collections +roles_path = ansible/roles +filter_plugins = ansible/filter_plugins +callbacks_enabled = ansible.posix.profile_tasks + +[ssh_connection] +ssh_args = -o ControlMaster=auto -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null +pipelining = True +# This is important because we are using one of the hosts in the play as a jump host +# This ensures that if the proxy connection is interrupted, rendering the other hosts +# unreachable, the connection is retried instead of failing the entire play +retries = 10 diff --git a/ansible/.gitignore b/ansible/.gitignore index d9e43b198..700bcce93 100644 --- a/ansible/.gitignore +++ b/ansible/.gitignore @@ -28,8 +28,6 @@ roles/* !roles/firewalld/** !roles/etc_hosts/ !roles/etc_hosts/** -!roles/cloud_init/ -!roles/cloud_init/** !roles/mysql/ !roles/mysql/** !roles/systemd/ @@ -46,3 +44,16 @@ roles/* !roles/prometheus/** !roles/cve-2023-41914 !roles/cve-2023-41914/** +!roles/cluster_infra/ +!roles/cluster_infra/** +!roles/image_build_infra/ +!roles/image_build_infra/** +!roles/persist_openhpc_secrets/ +!roles/persist_openhpc_secrets/** +!roles/zenith_proxy/ +!roles/zenith_proxy/** +!roles/image_build/ +!roles/image_build/** +!roles/persist_hostkeys/ +!roles/persist_hostkeys/** +!roles/requirements.yml diff --git a/ansible/adhoc/backup-keytabs.yml b/ansible/adhoc/backup-keytabs.yml new file mode 100644 index 000000000..5566e48ac --- /dev/null +++ b/ansible/adhoc/backup-keytabs.yml @@ -0,0 +1,11 @@ +# Use ONE of the following tags on this playbook: +# - retrieve: copies keytabs out of the state volume to the environment +# - deploy: copies keytabs from the environment to the state volume + +- hosts: freeipa_client + become: yes + gather_facts: no + tasks: + - import_role: + name: freeipa + tasks_from: backup-keytabs.yml diff --git a/ansible/adhoc/template-cloud-init.yml b/ansible/adhoc/template-cloud-init.yml deleted file mode 100644 index 92bb14a5d..000000000 --- a/ansible/adhoc/template-cloud-init.yml +++ /dev/null @@ -1,9 +0,0 @@ -- hosts: cloud_init - become: no - gather_facts: no - tasks: - - name: Template out cloud-init userdata - import_role: - name: cloud_init - tasks_from: template.yml - delegate_to: localhost diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index 485c637f5..9b6fda0de 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -82,6 +82,21 @@ policy: "{{ selinux_policy }}" register: sestatus +- hosts: freeipa_server + # Done here as it might be providing DNS + tags: + - freeipa + - freeipa_server + gather_facts: yes + become: yes + tasks: + - name: Install FreeIPA server + import_role: + name: freeipa + tasks_from: server.yml + +# --- tasks after here require access to package repos --- + - hosts: firewalld gather_facts: false become: yes @@ -99,6 +114,7 @@ name: fail2ban - name: Setup podman + gather_facts: false hosts: podman tags: podman tasks: diff --git a/ansible/extras.yml b/ansible/extras.yml index efb39f40f..0a27d1806 100644 --- a/ansible/extras.yml +++ b/ansible/extras.yml @@ -1,3 +1,13 @@ +- hosts: basic_users + become: yes + tags: + - basic_users + - users + gather_facts: yes + tasks: + - import_role: + name: basic_users + - name: 
Setup EESSI hosts: eessi tags: eessi diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index c6b182491..5cabeec97 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -34,6 +34,13 @@ state: stopped enabled: false + # - import_playbook: iam.yml + - name: Install FreeIPA client + import_role: + name: freeipa + tasks_from: client-install.yml + when: "'freeipa_client' in group_names" + # - import_playbook: filesystems.yml - name: nfs dnf: @@ -102,8 +109,6 @@ name: cloudalchemy.grafana tasks_from: install.yml - # - import_playbook: iam.yml - nothing to do - - name: Run post.yml hook vars: appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" diff --git a/ansible/iam.yml b/ansible/iam.yml index 266bca1ab..0286b9df3 100644 --- a/ansible/iam.yml +++ b/ansible/iam.yml @@ -1,8 +1,42 @@ -- hosts: basic_users +- hosts: freeipa_client + tags: + - freeipa + - freeipa_server # as this is only relevant if using freeipa_server + - freeipa_host + gather_facts: no become: yes + tasks: + - name: Ensure FreeIPA client hosts are added to the FreeIPA server + import_role: + name: freeipa + tasks_from: addhost.yml + when: groups['freeipa_server'] | length > 0 + +- hosts: freeipa_client tags: - - basic_users + - freeipa + - freeipa_client gather_facts: yes + become: yes + tasks: + - name: Install FreeIPA client + import_role: + name: freeipa + tasks_from: client-install.yml + - name: Enrol FreeIPA client + import_role: + name: freeipa + tasks_from: enrol.yml + +- hosts: freeipa_server + tags: + - freeipa + - freeipa_server + - users + gather_facts: yes + become: yes tasks: - - import_role: - name: basic_users + - name: Add FreeIPA users + import_role: + name: freeipa + tasks_from: users.yml diff --git a/ansible/noop.yml b/ansible/noop.yml index 49317736a..adad24813 100644 --- a/ansible/noop.yml +++ b/ansible/noop.yml @@ -6,4 +6,4 @@ - hosts: localhost gather_facts: false - tasks: [] \ No newline at end of file + tasks: [] diff --git a/ansible/roles/cluster_infra/defaults/main.yml b/ansible/roles/cluster_infra/defaults/main.yml new file mode 100644 index 000000000..ef8ea609b --- /dev/null +++ b/ansible/roles/cluster_infra/defaults/main.yml @@ -0,0 +1,7 @@ +cluster_deploy_ssh_keys_extra: [] + +# List of hw_scsi_models that result in block devices presenting as /dev/sdX +# rather than /dev/vdX +scsi_models: + # Ceph [https://docs.ceph.com/en/quincy/rbd/rbd-openstack/#image-properties] + - virtio-scsi diff --git a/ansible/roles/cluster_infra/tasks/main.yml b/ansible/roles/cluster_infra/tasks/main.yml new file mode 100644 index 000000000..cacf7b6a0 --- /dev/null +++ b/ansible/roles/cluster_infra/tasks/main.yml @@ -0,0 +1,104 @@ +- debug: + msg: | + terraform_backend_type: {{ terraform_backend_type }} + terraform_state: {{ terraform_state }} + cluster_upgrade_system_packages: {{ cluster_upgrade_system_packages | default('undefined') }} + +# We need to convert the floating IP id to an address for Terraform +# if we we have cluster_floating_ip, otherwise assume that we're +# assigning the FIP in Terraform and that it will be available in +# outputs.cluster_gateway_ip. 
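# Illustrative usage (the ID value is hypothetical): a deployment either supplies an
# existing floating IP by ID, e.g.
#   cluster_floating_ip: "<pre-allocated FIP id>"
# and the block below resolves it to an address, or omits it entirely, in which case
# the FIP is created by the templated Terraform and read back from the
# cluster_gateway_ip output.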
+- block: + - name: Look up floating IP + include_role: + name: stackhpc.terraform.infra + tasks_from: lookup_floating_ip + vars: + os_floating_ip_id: "{{ cluster_floating_ip }}" + + - name: Set floating IP address fact + set_fact: + cluster_floating_ip_address: "{{ os_floating_ip_info.floating_ip_address }}" + when: cluster_floating_ip is defined + +- name: Install Terraform binary + include_role: + name: stackhpc.terraform.install + +- name: Make Terraform project directory + file: + path: "{{ terraform_project_path }}" + state: directory + +- name: Write backend configuration + copy: + content: | + terraform { + backend "{{ terraform_backend_type }}" { } + } + dest: "{{ terraform_project_path }}/backend.tf" + +# Patching in this appliance is implemented as a switch to a new base image +# So unless explicitly patching, we want to use the same image as last time +# To do this, we query the previous Terraform state before updating +- block: + - name: Get previous Terraform state + stackhpc.terraform.terraform_output: + binary_path: "{{ terraform_binary_path }}" + project_path: "{{ terraform_project_path }}" + backend_config: "{{ terraform_backend_config }}" + register: cluster_infra_terraform_output + + - name: Extract image from Terraform state + set_fact: + cluster_previous_image: "{{ cluster_infra_terraform_output.outputs.cluster_image.value }}" + when: '"cluster_image" in cluster_infra_terraform_output.outputs' + when: + - terraform_state == "present" + - cluster_upgrade_system_packages is not defined or not cluster_upgrade_system_packages + +- name: Detect volume device prefix from image metadata + block: + - name: Get image metadata from OpenStack API + openstack.cloud.image_info: + image: "{{ cluster_previous_image | default(cluster_image) }}" + register: cluster_image_info + - name: Check only single image found + assert: + that: cluster_image_info.images | length == 1 + fail_msg: "Multiple images found for 'cluster_image' {{ cluster_image }}" + - name: Set volume_device_prefix fact + set_fact: + block_device_prefix: >- + {{ + 'sd' if (cluster_image_info.images | first).hw_scsi_model is defined and + (cluster_image_info.images | first).hw_scsi_model in scsi_models + else 'vd' + }} + # Only run when block_device_prefix isn't set as an extravar + when: + - block_device_prefix is not defined + - cluster_image is defined + +- name: Template Terraform files into project directory + template: + src: >- + {{ + "{}{}.j2".format( + ( + cluster_terraform_template_dir ~ "/" + if cluster_terraform_template_dir is defined + else "" + ), + item + ) + }} + dest: "{{ terraform_project_path }}/{{ item }}" + loop: + - outputs.tf + - providers.tf + - resources.tf + +- name: Provision infrastructure + include_role: + name: stackhpc.terraform.infra diff --git a/ansible/roles/cluster_infra/templates/outputs.tf.j2 b/ansible/roles/cluster_infra/templates/outputs.tf.j2 new file mode 100644 index 000000000..70b57d119 --- /dev/null +++ b/ansible/roles/cluster_infra/templates/outputs.tf.j2 @@ -0,0 +1,53 @@ +output "cluster_gateway_ip" { + description = "The IP address of the gateway used to contact the cluster nodes" + value = openstack_compute_floatingip_associate_v2.login_floatingip_assoc.floating_ip +} + +{% if cluster_ssh_private_key_file is not defined %} +output "cluster_ssh_private_key" { + description = "The private component of the keypair generated on cluster provision" + value = openstack_compute_keypair_v2.cluster_keypair.private_key + sensitive = true +} +{% endif %} + +output "cluster_nodes" { + 
description = "A list of the nodes in the cluster from which an Ansible inventory will be populated" + value = concat( + [ + { + name = openstack_compute_instance_v2.login.name + ip = openstack_compute_instance_v2.login.network[0].fixed_ip_v4 + groups = ["login", "{{ cluster_name }}_login"], + facts = { + openstack_project_id = data.openstack_identity_auth_scope_v3.scope.project_id + } + }, + { + name = openstack_compute_instance_v2.control.name + ip = openstack_compute_instance_v2.control.network[0].fixed_ip_v4 + groups = ["control", "{{ cluster_name }}_control"], + facts = { + openstack_project_id = data.openstack_identity_auth_scope_v3.scope.project_id + } + } + ], + {% for partition in openhpc_slurm_partitions %} + [ + for compute in openstack_compute_instance_v2.{{ partition.name }}: { + name = compute.name + ip = compute.network[0].fixed_ip_v4 + groups = ["compute", "{{ cluster_name }}_compute", "{{ cluster_name }}_{{ partition.name }}"], + facts = { + openstack_project_id = data.openstack_identity_auth_scope_v3.scope.project_id + } + } + ]{{ ',' if not loop.last }} + {% endfor %} + ) +} + +output "cluster_image" { + description = "The id of the image used to build the cluster nodes" + value = "{{ cluster_previous_image | default(cluster_image) }}" +} diff --git a/ansible/roles/cluster_infra/templates/providers.tf.j2 b/ansible/roles/cluster_infra/templates/providers.tf.j2 new file mode 100644 index 000000000..32a16f27b --- /dev/null +++ b/ansible/roles/cluster_infra/templates/providers.tf.j2 @@ -0,0 +1,10 @@ +terraform { + required_version = ">= 0.14" + + # We need the OpenStack provider + required_providers { + openstack = { + source = "terraform-provider-openstack/openstack" + } + } +} diff --git a/ansible/roles/cluster_infra/templates/resources.tf.j2 b/ansible/roles/cluster_infra/templates/resources.tf.j2 new file mode 100644 index 000000000..1a40361e4 --- /dev/null +++ b/ansible/roles/cluster_infra/templates/resources.tf.j2 @@ -0,0 +1,452 @@ +#jinja2: trim_blocks:False +##### +##### The identity scope we are operating in +##### Used to output the OpenStack project ID as a fact for provisioned hosts +##### +data "openstack_identity_auth_scope_v3" "scope" { + name = "{{ cluster_name }}" +} + +##### +##### Security groups for the cluster +##### + +# Security group to hold common rules for the cluster +resource "openstack_networking_secgroup_v2" "secgroup_slurm_cluster" { + name = "{{ cluster_name }}-secgroup-slurm-cluster" + description = "Rules for the slurm cluster nodes" + delete_default_rules = true # Fully manage with terraform +} + +# Security group to hold specific rules for the login node +resource "openstack_networking_secgroup_v2" "secgroup_slurm_login" { + name = "{{ cluster_name }}-secgroup-slurm-login" + description = "Specific rules for the slurm login node" + delete_default_rules = true # Fully manage with terraform +} + +## Allow all egress for all cluster nodes +resource "openstack_networking_secgroup_rule_v2" "secgroup_slurm_cluster_rule_egress_v4" { + direction = "egress" + ethertype = "IPv4" + security_group_id = "${openstack_networking_secgroup_v2.secgroup_slurm_cluster.id}" +} + +## Allow all ingress between nodes in the cluster +resource "openstack_networking_secgroup_rule_v2" "secgroup_slurm_cluster_rule_ingress_internal_v4" { + direction = "ingress" + ethertype = "IPv4" + remote_group_id = "${openstack_networking_secgroup_v2.secgroup_slurm_cluster.id}" + security_group_id = "${openstack_networking_secgroup_v2.secgroup_slurm_cluster.id}" +} + +## Allow 
ingress on port 22 (SSH) from anywhere for the login nodes +resource "openstack_networking_secgroup_rule_v2" "secgroup_slurm_login_rule_ingress_ssh_v4" { + direction = "ingress" + ethertype = "IPv4" + protocol = "tcp" + port_range_min = 22 + port_range_max = 22 + security_group_id = "${openstack_networking_secgroup_v2.secgroup_slurm_login.id}" +} + +## Allow ingress on port 443 (HTTPS) from anywhere for the login nodes +resource "openstack_networking_secgroup_rule_v2" "secgroup_slurm_login_rule_ingress_https_v4" { + direction = "ingress" + ethertype = "IPv4" + protocol = "tcp" + port_range_min = 443 + port_range_max = 443 + security_group_id = "${openstack_networking_secgroup_v2.secgroup_slurm_login.id}" +} + +## Allow ingress on port 80 (HTTP) from anywhere for the login nodes +resource "openstack_networking_secgroup_rule_v2" "secgroup_slurm_login_rule_ingress_http_v4" { + direction = "ingress" + ethertype = "IPv4" + protocol = "tcp" + port_range_min = 80 + port_range_max = 80 + security_group_id = "${openstack_networking_secgroup_v2.secgroup_slurm_login.id}" +} + +##### +##### Volumes +##### +resource "openstack_blockstorage_volume_v3" "state" { + name = "{{ cluster_name }}-state" + description = "State for control node" + size = "{{ state_volume_size }}" +} + +resource "openstack_blockstorage_volume_v3" "home" { + name = "{{ cluster_name }}-home" + description = "Home for control node" + size = "{{ home_volume_size }}" + {% if use_home_volume_type_fast is defined and use_home_volume_type_fast %} + {% if home_volume_type_fast is defined %} + volume_type = "{{ home_volume_type_fast }}" + {% endif %} + {% endif %} +} + +###### +###### Cluster network +###### + +# Always get cluster_external_network network and subnet data +data "openstack_networking_network_v2" "cluster_external_network" { + name = "{{ cluster_external_network }}" +} + +data "openstack_networking_subnet_ids_v2" "cluster_external_subnets" { + network_id = "${data.openstack_networking_network_v2.cluster_external_network.id}" +} + +{% if cluster_network is not defined %} +# Create a new network +resource "openstack_networking_network_v2" "cluster_network" { + name = "{{ cluster_name }}-net" + admin_state_up = "true" +} + +resource "openstack_networking_subnet_v2" "cluster_subnet" { + name = "{{ cluster_name }}-subnet" + network_id = "${openstack_networking_network_v2.cluster_network.id}" + cidr = "{{ cluster_cidr | default('192.168.44.0/24') }}" + {% if cluster_nameservers is defined %} + dns_nameservers = [ + {% for nameserver in cluster_nameservers %} + "{{ nameserver }}"{{ ',' if not loop.last }} + {% endfor %} + ] + {% endif %} + ip_version = 4 +} + +resource "openstack_networking_router_v2" "cluster_router" { + name = "{{ cluster_name }}-router" + admin_state_up = true + external_network_id = "${data.openstack_networking_network_v2.cluster_external_network.id}" +} + +resource "openstack_networking_router_interface_v2" "cluster_router_interface" { + router_id = "${openstack_networking_router_v2.cluster_router.id}" + subnet_id = "${openstack_networking_subnet_v2.cluster_subnet.id}" +} +{% endif %} + +# Get existing network resource data by name, from either the created +# network or the network name if supplied +data "openstack_networking_network_v2" "cluster_network" { + {% if cluster_network is not defined %} + network_id = "${openstack_networking_network_v2.cluster_network.id}" + {% else %} + name = "{{ cluster_network }}" + {% endif %} +} + +data "openstack_networking_subnet_v2" "cluster_subnet" { + # Get subnet data 
from the subnet we create, or if it exists already + # get it from the cluster network data above + {% if cluster_network is not defined %} + subnet_id = "${openstack_networking_subnet_v2.cluster_subnet.id}" + {% else %} + network_id = "${data.openstack_networking_network_v2.cluster_network.id}" + {% endif %} +} + +##### +##### Cluster ports +##### + +resource "openstack_networking_port_v2" "login" { + name = "{{ cluster_name }}-login-0" + network_id = "${data.openstack_networking_network_v2.cluster_network.id}" + admin_state_up = "true" + + fixed_ip { + subnet_id = "${data.openstack_networking_subnet_v2.cluster_subnet.id}" + } + + security_group_ids = [ + "${openstack_networking_secgroup_v2.secgroup_slurm_cluster.id}", + "${openstack_networking_secgroup_v2.secgroup_slurm_login.id}" + ] + + binding { + vnic_type = "{{ cluster_vnic_type | default('normal') }}" + {% if cluster_vnic_profile is defined %} + profile = </groups + ... + [resolv_conf:children] + freeipa_client + ... + ``` + + ```yaml + # environments//inventory/group_vars/all/resolv_conf.yml + resolv_conf_nameservers: + - "{{ hostvars[groups['freeipa_server'] | first].ansible_host }}" + ``` + + +- For production use with an external FreeIPA server, a random one-time password (OTP) must be generated when adding hosts to FreeIPA (e.g. using `ipa host-add --random ...`). This password should be set as a hostvar `freeipa_host_password`. Initial host enrolment will use this OTP to enrol the host. After this it becomes irrelevant so it does not need to be committed to git. This approach means the appliance does not require the FreeIPA administrator password. +- For development use with the in-appliance FreeIPA server, `freeipa_host_password` will be automatically generated in memory. +- The `control` host must define `appliances_state_dir` (on persistent storage). This is used to back-up keytabs to allow FreeIPA clients to automatically re-enrol after e.g. reimaging. Note that: + - This is implemented when using the skeleton Terraform; on the control node `appliances_state_dir` defaults to `/var/lib/state` which is mounted from a volume. + - Nodes are not re-enroled by a [Slurm-driven reimage](../../collections/ansible_collections/stackhpc/slurm_openstack_tools/roles/rebuild/README.md) (as that does not run this role). + - If both a backed-up keytab and `freeipa_host_password` exist, the former is used. + + +## Role Variables for Clients + +- `freeipa_host_password`. Required for initial enrolment only, FreeIPA host password as described above. +- `freeipa_setup_dns`: Optional, whether to use the FreeIPA server as the client's nameserver. Defaults to `true` when `freeipa_server` contains a host, otherwise `false`. + +See also use of `appliances_state_dir` on the control node as described above. + +# FreeIPA Server +As noted above this is only intended for development and testing. Note it cannot be run on the `openondemand` node as no other virtual servers must be defined in the Apache configuration. + +## Usage +- Add a single host to the `freeipa_server` group and run (at a minimum) the `ansible/bootstrap.yml` and `ansible/iam.yml` playbooks. +- As well as configuring the FreeIPA server, the role will also: + - Add ansible hosts in the group `freeipa_client` as FreeIPA hosts. + - Optionally control users in FreeIPA - see `freeipa_users` below. + +The FreeIPA GUI will be available on `https:///ipa/ui`. 
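As a minimal sketch (user names, path and vault variable are illustrative, not part of this change), users for the development server could be defined as:

```yaml
# environments/<environment>/inventory/group_vars/all/freeipa.yml
freeipa_users:
  - name: alice                 # account name
    givenname: Alice            # first name - required
    sn: Aardvark                # surname - required
    mail: alice@example.org
    password: "{{ vault_alice_password }}"  # plain text, NOT a hash; must be changed on first login
```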
+ +## Role Variables for Server + +These role variables are only required when using `freeipa_server`: + +- `freeipa_realm`: Optional, name of realm. Default is `{{ openhpc_cluster_name | upper }}.INVALID` +- `freeipa_domain`: Optional, name of domain. Default is lowercased `freeipa_realm`. +- `freeipa_ds_password`: Optional, password to be used by the Directory Server for the Directory Manager user (`ipa-server-install --ds-password`). Default is generated in `environments//inventory/group_vars/all/secrets.yml` +- `freeipa_admin_password`: Optional, password for the IPA `admin` user. Default is generated as for `freeipa_ds_password`. +- `freeipa_server_ip`: Optional, IP address of freeipa_server host. Default is `ansible_host` of the `freeipa_server` host. Default `false`. +- `freeipa_setup_dns`: Optional bool, whether to configure the FreeIPA server as an integrated DNS server and define a zone and records. NB: This also controls whether `freeipa_client` hosts use the `freeipa_server` host for name resolution. Default `true` when `freeipa_server` contains a host. +- `freeipa_client_ip`: Optional, IP address of FreeIPA client. Default is `ansible_host`. +- `freeipa_users`: A list of dicts defining users to add, with keys/values as for [community.general.ipa_user](https://docs.ansible.com/ansible/latest/collections/community/general/ipa_user_module.html): Note that: + - `name`, `givenname` (firstname) and `sn` (surname) are required. + - `ipa_host`, `ipa_port`, `ipa_prot`, `ipa_user`, `validate_certs` are automatically provided and cannot be overridden. + - If `password` is set, the value should *not* be a hash (unlike `ansible.builtin.user` as used by the `basic_users` role), and it must be changed on first login. `krbpasswordexpiration` does not appear to be able to override this. diff --git a/ansible/roles/freeipa/defaults/main.yml b/ansible/roles/freeipa/defaults/main.yml new file mode 100644 index 000000000..03b844c8a --- /dev/null +++ b/ansible/roles/freeipa/defaults/main.yml @@ -0,0 +1,14 @@ +#freeipa_realm: +freeipa_domain: "{{ freeipa_realm | lower }}" +#freeipa_ds_password: +#freeipa_admin_password: +#freeipa_server_ip: +freeipa_setup_dns: "{{ groups['freeipa_server'] | length > 0 }}" +freeipa_client_ip: "{{ ansible_host }}" # when run on freeipa_client group! 
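# Note: freeipa_host_password (below) is the one-time enrolment password. With an
# external FreeIPA server it is generated out-of-band (e.g. `ipa host-add ... --random`)
# and set as a hostvar; with the in-appliance server it is set as a fact by
# tasks/addhost.yml.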
+# freeipa_host_password: +freeipa_user_defaults: + ipa_pass: "{{ freeipa_admin_password | quote }}" + ipa_user: admin +freeipa_users: [] # see community.general.ipa_user + +_freeipa_keytab_backup_path: "{{ hostvars[groups['control'].0].appliances_state_dir }}/freeipa/{{ inventory_hostname }}/krb5.keytab" diff --git a/ansible/roles/freeipa/tasks/addhost.yml b/ansible/roles/freeipa/tasks/addhost.yml new file mode 100644 index 000000000..cf1f4475a --- /dev/null +++ b/ansible/roles/freeipa/tasks/addhost.yml @@ -0,0 +1,35 @@ +- name: Get ipa host information + # This uses DNS to find the ipa server, which works as this is running on the enrolled ipa server + # It doesn't fail even if the host doesn't exist + community.general.ipa_host: + name: "{{ node_fqdn }}" + ip_address: "{{ freeipa_client_ip }}" + ipa_host: "{{ groups['freeipa_server'].0 }}" + ipa_pass: "{{ vault_freeipa_admin_password }}" + ipa_user: admin + state: present + validate_certs: false + delegate_to: "{{ groups['freeipa_server'].0 }}" + register: _ipa_host_check + check_mode: yes + changed_when: false + +- name: Add host to IPA + # Using random_password=true this unenroles an enroled host, hence the check above + community.general.ipa_host: + name: "{{ node_fqdn }}" + ip_address: "{{ freeipa_client_ip }}" + ipa_host: "{{ groups['freeipa_server'].0 }}" + ipa_pass: "{{ vault_freeipa_admin_password }}" + ipa_user: admin + random_password: true + state: present + validate_certs: false + delegate_to: "{{ groups['freeipa_server'].0 }}" + when: "'sshpubkeyfp' not in _ipa_host_check.host" + register: _ipa_host_add + +- name: Set fact for ipa host password + set_fact: + freeipa_host_password: "{{ _ipa_host_add.host.randompassword }}" + when: _ipa_host_add.changed diff --git a/ansible/roles/freeipa/tasks/backup-keytabs.yml b/ansible/roles/freeipa/tasks/backup-keytabs.yml new file mode 100644 index 000000000..7fc77f9e1 --- /dev/null +++ b/ansible/roles/freeipa/tasks/backup-keytabs.yml @@ -0,0 +1,14 @@ +- name: Retrieve keytabs to localhost + fetch: + src: "{{ _freeipa_keytab_backup_path }}" + dest: "{{ appliances_environment_root }}/keytabs/{{ inventory_hostname }}/" + flat: true + delegate_to: "{{ groups['control'].0 }}" + tags: retrieve + +- name: Copy keytabs back to control node + copy: + src: "{{ appliances_environment_root }}/keytabs/{{ inventory_hostname }}/" + dest: "{{ _freeipa_keytab_backup_path | dirname }}" + delegate_to: "{{ groups['control'].0 }}" + tags: deploy diff --git a/ansible/roles/freeipa/tasks/client-install.yml b/ansible/roles/freeipa/tasks/client-install.yml new file mode 100644 index 000000000..a164cd26e --- /dev/null +++ b/ansible/roles/freeipa/tasks/client-install.yml @@ -0,0 +1,4 @@ + +- name: Install FreeIPA client package + dnf: + name: ipa-client diff --git a/ansible/roles/freeipa/tasks/enrol.yml b/ansible/roles/freeipa/tasks/enrol.yml new file mode 100644 index 000000000..07436509b --- /dev/null +++ b/ansible/roles/freeipa/tasks/enrol.yml @@ -0,0 +1,87 @@ +# based on https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/8/html/installing_identity_management/assembly_installing-an-idm-client_installing-identity-management + +- name: Retrieve persisted keytab from previous enrolement + slurp: + src: "{{ _freeipa_keytab_backup_path }}" + delegate_to: "{{ groups['control'] | first }}" + register: _slurp_persisted_keytab + failed_when: false + +- name: Write persisted keytab from previous enrolment + copy: + content: "{{ _slurp_persisted_keytab.content | b64decode }}" + dest: /tmp/krb5.keytab 
+ owner: root + group: root + mode: ug=rw,o= + when: '"content" in _slurp_persisted_keytab' + +- name: Re-enrol with FreeIPA using backed-up keytab + # Re-enrolment requires --force-join and --password, or --keytab + # Re-rolement means: + # 1. A new host certificate is issued + # 2. The old host certificate is revoked + # 3. New SSH keys are generated + # 4. ipaUniqueID is preserved + # and ALSO that the keytab is changed! + command: + cmd: > + ipa-client-install + --unattended + --mkhomedir + --enable-dns-updates + --keytab /tmp/krb5.keytab + when: '"content" in _slurp_persisted_keytab' + register: ipa_client_install_keytab + changed_when: ipa_client_install_keytab.rc == 0 + failed_when: > + ipa_client_install_keytab.rc !=0 and + 'IPA client is already configured' not in ipa_client_install_keytab.stderr + +- name: Enrol with FreeIPA using random password + # Note --password is overloaded - it's bulkpassword unless --principal or --force-join is used in which case it's admin password + command: + cmd: > + ipa-client-install + --unattended + --mkhomedir + --enable-dns-updates + --password '{{ freeipa_host_password }}' + when: + - '"content" not in _slurp_persisted_keytab' + - freeipa_host_password is defined + register: ipa_client_install_password + changed_when: ipa_client_install_password.rc == 0 + failed_when: > + ipa_client_install_password.rc != 0 and + 'IPA client is already configured' not in ipa_client_install_password.stderr + +- name: Ensure NFS RPC security service is running + # This service is installed by nfs-utils, which attempts to start it. + # It has ConditionPathExists=/etc/krb5.keytab which fails if host is not enroled. + # This task avoids a reboot. + systemd: + name: rpc-gssd.service + state: started + enabled: true + +- name: Retrieve current keytab + slurp: + src: /etc/krb5.keytab + register: _slurp_current_keytab + failed_when: false + +- name: Ensure keytab backup directory exists + file: + path: "{{ _freeipa_keytab_backup_path | dirname }}" + state: directory + owner: root + group: root + mode: ug=wrX,o= + delegate_to: "{{ groups['control'] | first }}" + +- name: Persist keytab + copy: + content: "{{ _slurp_current_keytab.content | b64decode }}" + dest: "{{ _freeipa_keytab_backup_path }}" + delegate_to: "{{ groups['control'] | first }}" diff --git a/ansible/roles/freeipa/tasks/server.yml b/ansible/roles/freeipa/tasks/server.yml new file mode 100644 index 000000000..33e15733d --- /dev/null +++ b/ansible/roles/freeipa/tasks/server.yml @@ -0,0 +1,63 @@ +# Based on https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/8/html/installing_identity_management/preparing-the-system-for-ipa-server-installation_installing-identity-management#host-name-and-dns-requirements-for-ipa_preparing-the-system-for-ipa-server-installation + +- name: Install freeipa server packages + dnf: + name: '@idm:DL1/dns' + state: present + +- name: Install ipa server +# TODO: make no-ui-redirect and dns configurable?? +# TODO: set file mask as per docs? Would be hard to cope with failures. Doesn't appear to be necessary actually. 
+ command: + cmd: > + ipa-server-install + --realm {{ freeipa_realm | quote }} + --domain {{ freeipa_domain | lower | quote }} + --ds-password {{ freeipa_ds_password | quote }} + --admin-password {{ freeipa_admin_password | quote }} + --ip-address={{ freeipa_server_ip }} + {% if freeipa_setup_dns | bool %}--setup-dns{% endif %} + --auto-reverse + --auto-forwarders + --no-dnssec-validation + --no-ntp + --unattended + --no-ui-redirect + no_log: "{{ no_log | default(true) }}" + register: _ipa_server_install + changed_when: _ipa_server_install.rc == 0 + failed_when: > + (_ipa_server_install.rc != 0) and + ('IPA server is already configured' not in _ipa_server_install.stderr) + +- name: Disable redirects to hard-coded domain + # see https://pagure.io/freeipa/issue/7479 + replace: path=/etc/httpd/conf.d/ipa-rewrite.conf regexp='{{ item.regexp }}' replace='{{ item.replace }}' + with_items: + # RewriteRule ^/$ https://${FQDN}/ipa/ui [L,NC,R=301] - irrelevant if using --no-ui-redirect + - regexp: '^(RewriteRule \^/\$) (https://.*)(/ipa/ui.*)$' + replace: '\1 \3' + # RewriteRule ^/ipa/(.*) - occurs twice + - regexp: '^(RewriteRule \^\/ipa\/\(.*)$' + replace: '#\1' + - regexp: '^(RewriteCond .*)$' + replace: '#\1' + # RewriteRule ^/(.*) https://${FQDN}/$1 [L,R=301] + - regexp: '^(RewriteRule \^/\(\.\*\).*)$' + replace: '#\1' + register: _replace_freeipa_rewrites + +- name: Deactivate HTTP RefererError + replace: + path: '/usr/lib/python3.6/site-packages/ipaserver/rpcserver.py' + regexp: '{{ item }}' + replace: '\1pass # \2' + with_items: + - "^([ ]*)(return self.marshal\\(result, RefererError\\(referer)" + register: _replace_rpcserver_referrer + +- name: Reload apache configuration + service: + name: httpd + state: reloaded + when: _replace_freeipa_rewrites.changed or _replace_rpcserver_referrer.changed diff --git a/ansible/roles/freeipa/tasks/users.yml b/ansible/roles/freeipa/tasks/users.yml new file mode 100644 index 000000000..bd1cacad3 --- /dev/null +++ b/ansible/roles/freeipa/tasks/users.yml @@ -0,0 +1,27 @@ +- name: Add users to freeipa + # This uses DNS to find the ipa server, which works as this is running on the enrolled ipa server + community.general.ipa_user: + displayname: "{{ item.displayname | default(omit) }}" + gidnumber: "{{ item.gidnumber | default(omit) }}" + givenname: "{{ item.givenname }}" + #ipa_host + ipa_pass: "{{ freeipa_admin_password | quote }}" + #ipa_port + #ipa_prot + ipa_timeout: "{{ item.ipa_timeout | default(omit) }}" + #ipa_user + krbpasswordexpiration: "{{ item.krbpasswordexpiration | default(omit) }}" + loginshell: "{{ item.loginshell | default(omit) }}" + mail: "{{ item.mail | default(omit) }}" + password: "{{ item.password | default(omit) }}" + sn: "{{ item.sn }}" + sshpubkey: "{{ item.sshpubkey | default(omit) }}" + state: "{{ item.state | default(omit) }}" + telephonenumber: "{{ item.telephonenumber | default(omit) }}" + title: "{{ item.title | default(omit) }}" + uid: "{{ item.name | default(item.uid) }}" + uidnumber: "{{ item.uidnumber | default(omit) }}" + update_password: "{{ item.update_password | default(omit) }}" + userauthtype: "{{ item.userauthtype | default(omit) }}" + #validate_certs + loop: "{{ freeipa_users }}" diff --git a/ansible/roles/freeipa/tasks/validate.yml b/ansible/roles/freeipa/tasks/validate.yml new file mode 100644 index 000000000..238f89e60 --- /dev/null +++ b/ansible/roles/freeipa/tasks/validate.yml @@ -0,0 +1,36 @@ +- name: Get hostname as reported by command + command: hostname + register: _freeipa_validate_hostname + changed_when: 
false + when: "'freeipa_server' in group_names" + +- name: Ensure hostname is fully-qualified + # see section 2.7 of redhat guide to installing identity management + assert: + that: _freeipa_validate_hostname.stdout | split('.') | length >= 3 + fail_msg: "freeipa_server hostname '{{ _freeipa_validate_hostname.stdout }}' is not fully-qualified (a.b.c)" + when: "'freeipa_server' in group_names" + +- name: Check for virtual servers in httpd configuration of freeipa_server + # e.g. fatimage with OOD config; community.general.ipa_host fails with "401 Unauthorized: No session cookie found" + # https://lists.fedoraproject.org/archives/list/freeipa-users@lists.fedorahosted.org/message/7RH7XDFR35KDPYJ7AQCQI2H2EOWIZCWA/ + find: + path: /etc/httpd/conf.d/ + contains: '/dev/null | base64') }}" diff --git a/ansible/roles/persist_hostkeys/tasks/main.yml b/ansible/roles/persist_hostkeys/tasks/main.yml new file mode 100644 index 000000000..47493220d --- /dev/null +++ b/ansible/roles/persist_hostkeys/tasks/main.yml @@ -0,0 +1,33 @@ +--- + +- name: Ensure hostkeys directory exists on persistent storage + file: + path: "{{ appliances_state_dir }}/hostkeys/{{ inventory_hostname }}" + state: directory + owner: root + group: root + mode: 0600 + +- name: Copy hostkeys from persistent storage + # won't fail if no keys are in persistent storage + copy: + src: "{{ appliances_state_dir }}/hostkeys/{{ inventory_hostname }}/" + dest: /etc/ssh/ + remote_src: true + +- name: Find hostkeys + find: + path: /etc/ssh/ + patterns: ssh_host_*_key* + register: _find_ssh_keys + +- name: Persist hostkeys + copy: + dest: "{{ appliances_state_dir }}/hostkeys/{{ inventory_hostname }}/" + src: "{{ item }}" + remote_src: true + mode: preserve + loop: "{{ _find_ssh_keys.files | map(attribute='path') }}" + +- meta: reset_connection + diff --git a/ansible/roles/persist_openhpc_secrets/tasks/main.yml b/ansible/roles/persist_openhpc_secrets/tasks/main.yml new file mode 100644 index 000000000..6ae9bcd59 --- /dev/null +++ b/ansible/roles/persist_openhpc_secrets/tasks/main.yml @@ -0,0 +1,35 @@ +--- + +- name: Check if OpenHPC secrets exist in persistent storage + stat: + path: "{{ appliances_state_dir }}/ansible.facts.d/openhpc_secrets.fact" + register: openhpc_secrets_stat + +- name: Ensure Ansible facts directories exist + file: + path: "{{ item }}" + state: directory + owner: root + mode: 0600 + loop: + - "{{ appliances_state_dir }}/ansible.facts.d" + - "/etc/ansible/facts.d" + +- name: Write OpenHPC secrets + template: + src: openhpc_secrets.fact + dest: "{{ appliances_state_dir }}/ansible.facts.d/openhpc_secrets.fact" + owner: root + mode: 0600 + when: "not openhpc_secrets_stat.stat.exists" + +- name: Symlink persistent facts to facts_path + file: + state: link + src: "{{ appliances_state_dir }}/ansible.facts.d/openhpc_secrets.fact" + dest: /etc/ansible/facts.d/openhpc_secrets.fact + owner: root + +- name: Read facts + ansible.builtin.setup: + filter: ansible_local diff --git a/ansible/roles/persist_openhpc_secrets/templates/openhpc_secrets.fact b/ansible/roles/persist_openhpc_secrets/templates/openhpc_secrets.fact new file mode 100644 index 000000000..9d6de37d8 --- /dev/null +++ b/ansible/roles/persist_openhpc_secrets/templates/openhpc_secrets.fact @@ -0,0 +1,9 @@ +{ + "vault_azimuth_user_password": "{{ lookup('password', '/dev/null') }}", + "vault_grafana_admin_password": "{{ lookup('password', '/dev/null') }}", + "vault_elasticsearch_admin_password": "{{ lookup('password', '/dev/null') }}", + "vault_elasticsearch_kibana_password": 
"{{ lookup('password', '/dev/null') }}", + "vault_mysql_root_password": "{{ lookup('password', '/dev/null') }}", + "vault_mysql_slurm_password": "{{ lookup('password', '/dev/null') }}", + "vault_openhpc_mungekey": "{{ lookup('pipe', 'dd if=/dev/urandom bs=1 count=1024 2>/dev/null | base64') | regex_replace('\s+', '') }}" +} diff --git a/ansible/roles/proxy/tasks/main.yml b/ansible/roles/proxy/tasks/main.yml index 5368d6da2..3bc33cfa2 100644 --- a/ansible/roles/proxy/tasks/main.yml +++ b/ansible/roles/proxy/tasks/main.yml @@ -60,4 +60,4 @@ - name: Reset connection to get new /etc/environment meta: reset_connection - # NB: conditionals not supported \ No newline at end of file + # NB: conditionals not supported diff --git a/ansible/roles/resolv_conf/templates/resolv.conf.j2 b/ansible/roles/resolv_conf/templates/resolv.conf.j2 index 59c2c000f..b752046c9 100644 --- a/ansible/roles/resolv_conf/templates/resolv.conf.j2 +++ b/ansible/roles/resolv_conf/templates/resolv.conf.j2 @@ -1,5 +1,7 @@ # Created by slurm appliance ansible/roles/resolv_conf -search {{ openhpc_cluster_name }}.{{ tld }} +{% if cluster_domain_suffix is defined %} +search {{ openhpc_cluster_name }}.{{ cluster_domain_suffix }} +{% endif %} {% for ns in resolv_conf_nameservers[0:3] %} nameserver {{ ns }} diff --git a/ansible/roles/zenith_proxy/defaults/main.yml b/ansible/roles/zenith_proxy/defaults/main.yml new file mode 100644 index 000000000..dbb920c58 --- /dev/null +++ b/ansible/roles/zenith_proxy/defaults/main.yml @@ -0,0 +1,57 @@ +--- + +zenith_registrar_url: "{{ undef(hint = 'zenith_registrar_url is required') }}" +zenith_registrar_verify_ssl: true +zenith_sshd_host: "{{ undef(hint = 'zenith_sshd_host is required') }}" +zenith_sshd_port: 22 + +zenith_proxy_podman_user: "{{ ansible_user }}" + +zenith_proxy_service_name: "{{ undef(hint = 'zenith_proxy_service_name is required') }}" +zenith_proxy_client_service_name: "{{ zenith_proxy_service_name }}-client" +zenith_proxy_mitm_service_name: "{{ zenith_proxy_service_name }}-mitm" + +zenith_proxy_pod_name: "{{ zenith_proxy_service_name }}" +zenith_proxy_client_container_name: "{{ zenith_proxy_client_service_name }}" +zenith_proxy_mitm_container_name: "{{ zenith_proxy_mitm_service_name }}" + +zenith_proxy_image_tag: '0.1.0' + +zenith_proxy_client_image_repository: ghcr.io/stackhpc/zenith-client +zenith_proxy_client_image: "{{ zenith_proxy_client_image_repository }}:{{ zenith_proxy_image_tag }}" + +zenith_proxy_mitm_image_repository: ghcr.io/stackhpc/zenith-proxy +zenith_proxy_mitm_image: "{{ zenith_proxy_mitm_image_repository }}:{{ zenith_proxy_image_tag }}" + +zenith_proxy_upstream_scheme: http +zenith_proxy_upstream_host: "{{ undef(hint = 'zenith_proxy_upstream_host is required') }}" +zenith_proxy_upstream_port: "{{ undef(hint = 'zenith_proxy_upstream_port is required') }}" +zenith_proxy_upstream_read_timeout: + +zenith_proxy_client_token: "{{ undef(hint = 'zenith_proxy_client_token is required') }}" +zenith_proxy_client_auth_skip: false +zenith_proxy_client_auth_params: {} + +zenith_proxy_mitm_enabled: no +zenith_proxy_mitm_listen_port: 8080 +zenith_proxy_mitm_auth_inject: none # valid values are 'basic' and 'bearer' +zenith_proxy_mitm_auth_basic_username: >- + {{ + undef(hint = 'zenith_proxy_mitm_auth_basic_username is required') + if zenith_proxy_mitm_auth_inject == "basic" + else None + }} +zenith_proxy_mitm_auth_basic_password: >- + {{ + undef(hint = 'zenith_proxy_mitm_auth_basic_password is required') + if zenith_proxy_mitm_auth_inject == "basic" + else None + }} 
+zenith_proxy_mitm_auth_bearer_header_name: Authorization +zenith_proxy_mitm_auth_bearer_header_prefix: Bearer +zenith_proxy_mitm_auth_bearer_token: >- + {{ + undef(hint = 'zenith_proxy_mitm_auth_bearer_token is required') + if zenith_proxy_mitm_auth_inject == "bearer" + else None + }} diff --git a/ansible/roles/zenith_proxy/files/podman-pod-infra-attach.sh b/ansible/roles/zenith_proxy/files/podman-pod-infra-attach.sh new file mode 100644 index 000000000..aab232a0a --- /dev/null +++ b/ansible/roles/zenith_proxy/files/podman-pod-infra-attach.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +##### +# Small script that can be used to attach to the infra container of a pod +# +# Useful in a systemd service that starts a pod in order to track the execution +# +# Accepts a single argument which is the name of the pod whose infra container we should attach to +##### + +set -e + +echo "[INFO] Finding infra container for pod '$1'" +INFRA_CONTAINER_ID="$(podman pod inspect --format '{{.InfraContainerID}}' "$1")" + +echo "[INFO] Attaching to infra container '${INFRA_CONTAINER_ID}'" +exec podman container attach --no-stdin ${INFRA_CONTAINER_ID} diff --git a/ansible/roles/zenith_proxy/tasks/main.yml b/ansible/roles/zenith_proxy/tasks/main.yml new file mode 100644 index 000000000..1a42b0438 --- /dev/null +++ b/ansible/roles/zenith_proxy/tasks/main.yml @@ -0,0 +1,103 @@ +--- + +- name: Install script for attaching to pod infra containers + copy: + src: podman-pod-infra-attach.sh + dest: /usr/bin/ + mode: +x + become: true + +- name: Create systemd unit for Zenith pod + template: + src: pod.service.j2 + dest: /etc/systemd/system/{{ zenith_proxy_service_name }}.service + become: true + register: zenith_proxy_pod_systemd_unit + +- name: Ensure Zenith pod is started and enabled + service: + name: "{{ zenith_proxy_service_name }}.service" + state: "{{ 'restarted' if zenith_proxy_pod_systemd_unit is changed else 'started' }}" + enabled: yes + daemon_reload: "{{ zenith_proxy_pod_systemd_unit is changed }}" + become: true + +- block: + - name: Create systemd unit file for MITM proxy + template: + src: mitm.service.j2 + dest: /etc/systemd/system/{{ zenith_proxy_mitm_service_name }}.service + register: zenith_proxy_mitm_systemd_unit + + - name: Ensure MITM proxy is started and enabled + service: + name: "{{ zenith_proxy_mitm_service_name }}.service" + state: "{{ 'restarted' if zenith_proxy_mitm_systemd_unit is changed else 'started' }}" + enabled: yes + daemon_reload: "{{ zenith_proxy_mitm_systemd_unit is changed }}" + become: true + when: zenith_proxy_mitm_enabled + +- name: Ensure Zenith config directory exists + file: + path: /etc/zenith/{{ zenith_proxy_service_name }} + state: directory + become: true + +- name: Write Zenith client configuration + template: + src: zenith-client.yaml.j2 + dest: /etc/zenith/{{ zenith_proxy_service_name }}/client.yaml + become: true + register: zenith_proxy_client_config_file + +- name: Create directory to persist SSH key + file: + path: "{{ appliances_state_dir }}/{{ zenith_proxy_service_name }}-ssh" + state: directory + owner: "{{ zenith_proxy_podman_user }}" + group: "{{ zenith_proxy_podman_user }}" + become: true + +- name: Initialise Zenith client + # Use a foreground command rather than the podman_container module as I could not + # work out the combination of parameters that produced the desired behaviour :-( + command: >- + podman run + --name {{ zenith_proxy_service_name }}-init + --replace + --volume /etc/zenith/{{ zenith_proxy_service_name }}:/etc/zenith:ro + --volume {{ 
appliances_state_dir }}/{{ zenith_proxy_service_name }}-ssh:/home/zenith/.ssh + {{ zenith_proxy_client_image }} + zenith-client init + become: true + become_user: "{{ zenith_proxy_podman_user }}" + register: zenith_proxy_client_init + changed_when: zenith_proxy_client_init.rc == 0 + failed_when: >- + zenith_proxy_client_init.rc != 0 and + "token has already been used" not in zenith_proxy_client_init.stderr + +- name: Create systemd unit file for Zenith client + template: + src: client.service.j2 + dest: /etc/systemd/system/{{ zenith_proxy_client_service_name }}.service + become: true + register: zenith_proxy_client_systemd_unit + +- name: Ensure Zenith client is started and enabled + service: + name: "{{ zenith_proxy_client_service_name }}.service" + state: >- + {{ + 'restarted' + if ( + zenith_proxy_client_config_file is changed or + zenith_proxy_client_systemd_unit is changed or + zenith_proxy_client_init is changed + ) + else 'started' + }} + enabled: yes + daemon_reload: "{{ zenith_proxy_client_systemd_unit is changed }}" + become: true diff --git a/ansible/roles/zenith_proxy/templates/client.service.j2 b/ansible/roles/zenith_proxy/templates/client.service.j2 new file mode 100644 index 000000000..809b19b87 --- /dev/null +++ b/ansible/roles/zenith_proxy/templates/client.service.j2 @@ -0,0 +1,34 @@ +[Unit] +Description=Podman {{ zenith_proxy_client_service_name }}.service +Wants=network.target +After=network-online.target +BindsTo={{ zenith_proxy_service_name }}.service +PartOf={{ zenith_proxy_service_name }}.service +After={{ zenith_proxy_service_name }}.service +{% if zenith_proxy_mitm_enabled %} +Wants={{ zenith_proxy_mitm_service_name }}.service +After={{ zenith_proxy_mitm_service_name }}.service +{% endif %} + +[Service] +Environment=PODMAN_SYSTEMD_UNIT=%n +Type=simple +Restart=always +RestartSec=5 +User={{ zenith_proxy_podman_user }} +Group={{ zenith_proxy_podman_user }} +ExecStart=/usr/bin/podman run \ + --cgroups=no-conmon \ + --replace \ + --restart=no \ + --pod {{ zenith_proxy_pod_name }} \ + --name {{ zenith_proxy_client_container_name }} \ + --security-opt label=disable \ + --volume /etc/zenith/{{ zenith_proxy_service_name }}:/etc/zenith:ro \ + --volume {{ appliances_state_dir }}/{{ zenith_proxy_service_name }}-ssh:/home/zenith/.ssh \ + {{ zenith_proxy_client_image }} +ExecStop=/usr/bin/podman stop --ignore -t 10 {{ zenith_proxy_client_container_name }} +ExecStopPost=/usr/bin/podman rm --ignore -f {{ zenith_proxy_client_container_name }} + +[Install] +WantedBy=multi-user.target default.target diff --git a/ansible/roles/zenith_proxy/templates/mitm.service.j2 b/ansible/roles/zenith_proxy/templates/mitm.service.j2 new file mode 100644 index 000000000..d8b3c954b --- /dev/null +++ b/ansible/roles/zenith_proxy/templates/mitm.service.j2 @@ -0,0 +1,46 @@ + + +[Unit] +Description=Podman {{ zenith_proxy_mitm_service_name }}.service +Wants=network.target +After=network-online.target +BindsTo={{ zenith_proxy_service_name }}.service +PartOf={{ zenith_proxy_service_name }}.service +After={{ zenith_proxy_service_name }}.service + +[Service] +Environment=PODMAN_SYSTEMD_UNIT=%n +Type=simple +Restart=always +User={{ zenith_proxy_podman_user }} +Group={{ zenith_proxy_podman_user }} +ExecStart=/usr/bin/podman run \ + --cgroups=no-conmon \ + --replace \ + --restart=no \ + --pod {{ zenith_proxy_pod_name }} \ + --name {{ zenith_proxy_mitm_container_name }} \ + --security-opt label=disable \ + --env ZENITH_PROXY_LISTEN_PORT={{ zenith_proxy_mitm_listen_port }} \ + --env 
ZENITH_PROXY_UPSTREAM_SCHEME={{ zenith_proxy_upstream_scheme }} \ + --env ZENITH_PROXY_UPSTREAM_HOST={{ zenith_proxy_upstream_host }} \ + --env ZENITH_PROXY_UPSTREAM_PORT={{ zenith_proxy_upstream_port }} \ +{% if zenith_proxy_upstream_read_timeout %} + --env ZENITH_PROXY_READ_TIMEOUT={{ zenith_proxy_upstream_read_timeout }} \ +{% endif %} +{% if zenith_proxy_mitm_auth_inject == "basic" %} + --env ZENITH_PROXY_AUTH_INJECT=basic \ + --env ZENITH_PROXY_AUTH_BASIC_USERNAME={{ zenith_proxy_mitm_auth_basic_username }} \ + --env {{ "ZENITH_PROXY_AUTH_BASIC_PASSWORD={}".format(zenith_proxy_mitm_auth_basic_password) | quote }} \ +{% elif zenith_proxy_mitm_auth_inject == "bearer" %} + --env ZENITH_PROXY_AUTH_INJECT=bearer \ + --env ZENITH_PROXY_AUTH_BEARER_HEADER={{ zenith_proxy_mitm_auth_bearer_header_name }} \ + --env ZENITH_PROXY_AUTH_BEARER_PREFIX={{ zenith_proxy_mitm_auth_bearer_header_prefix }} \ + --env ZENITH_PROXY_AUTH_BEARER_TOKEN={{ zenith_proxy_mitm_auth_bearer_token }} \ +{% endif %} + {{ zenith_proxy_mitm_image }} +ExecStop=/usr/bin/podman stop --ignore -t 10 {{ zenith_proxy_mitm_container_name }} +ExecStopPost=/usr/bin/podman rm --ignore -f {{ zenith_proxy_mitm_container_name }} + +[Install] +WantedBy=multi-user.target default.target diff --git a/ansible/roles/zenith_proxy/templates/pod.service.j2 b/ansible/roles/zenith_proxy/templates/pod.service.j2 new file mode 100644 index 000000000..d46617556 --- /dev/null +++ b/ansible/roles/zenith_proxy/templates/pod.service.j2 @@ -0,0 +1,19 @@ +[Unit] +Description=Podman {{ zenith_proxy_service_name }}.service +Wants=network.target +After=network-online.target + +[Service] +Environment=PODMAN_SYSTEMD_UNIT=%n +Type=simple +Restart=always +User={{ zenith_proxy_podman_user }} +Group={{ zenith_proxy_podman_user }} +ExecStartPre=/usr/bin/podman pod create --replace --name {{ zenith_proxy_pod_name }} +ExecStartPre=/usr/bin/podman pod start {{ zenith_proxy_pod_name }} +ExecStart=/usr/bin/podman-pod-infra-attach.sh {{ zenith_proxy_pod_name }} +ExecStop=/usr/bin/podman pod stop --ignore -t 10 {{ zenith_proxy_pod_name }} +ExecStopPost=/usr/bin/podman pod rm --ignore -f {{ zenith_proxy_pod_name }} + +[Install] +WantedBy=multi-user.target default.target diff --git a/ansible/roles/zenith_proxy/templates/zenith-client.yaml.j2 b/ansible/roles/zenith_proxy/templates/zenith-client.yaml.j2 new file mode 100644 index 000000000..c037d7dc6 --- /dev/null +++ b/ansible/roles/zenith_proxy/templates/zenith-client.yaml.j2 @@ -0,0 +1,27 @@ +ssh_identity_path: /home/zenith/.ssh/id_zenith + +# Init options +registrar_url: {{ zenith_registrar_url }} +token: {{ zenith_proxy_client_token }} +verify_ssl: {{ 'yes' if zenith_registrar_verify_ssl else 'no' }} + +# Connect options +server_address: {{ zenith_sshd_host }} +server_port: {{ zenith_sshd_port }} +{% if zenith_proxy_mitm_enabled %} +backend_protocol: http +forward_to_host: 127.0.0.1 +forward_to_port: {{ zenith_proxy_mitm_listen_port }} +{% else %} +backend_protocol: {{ zenith_proxy_upstream_scheme }} +forward_to_host: {{ zenith_proxy_upstream_host }} +forward_to_port: {{ zenith_proxy_upstream_port }} +{% endif %} +{% if zenith_proxy_upstream_read_timeout %} +read_timeout: {{ zenith_proxy_upstream_read_timeout }} +{% endif %} +skip_auth: {{ 'yes' if zenith_proxy_client_auth_skip else 'no' }} +{% if zenith_proxy_client_auth_params %} +auth_params: + {{ zenith_proxy_client_auth_params | to_nice_yaml | indent(2) }} +{% endif %} diff --git a/ansible/site.yml b/ansible/site.yml index 37befa547..1804a2365 100644 --- 
a/ansible/site.yml +++ b/ansible/site.yml @@ -2,31 +2,36 @@ - name: Run pre.yml hook vars: - appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" + # hostvars not available here, so have to recalculate environment root: + appliances_environment_root: "{{ ansible_inventory_sources | last | dirname }}" hook_path: "{{ appliances_environment_root }}/hooks/pre.yml" import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}" when: hook_path | exists - import_playbook: validate.yml + when: "{{ appliances_validate | default(true) }}" + - import_playbook: bootstrap.yml - name: Run post-bootstrap.yml hook vars: - appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" + # hostvars not available here, so have to recalculate environment root: + appliances_environment_root: "{{ ansible_inventory_sources | last | dirname }}" hook_path: "{{ appliances_environment_root }}/hooks/post-bootstrap.yml" import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}" when: hook_path | exists +- import_playbook: iam.yml - import_playbook: filesystems.yml - import_playbook: extras.yml - import_playbook: slurm.yml - import_playbook: portal.yml - import_playbook: monitoring.yml -- import_playbook: iam.yml - name: Run post.yml hook vars: - appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" + # hostvars not available here, so have to recalculate environment root: + appliances_environment_root: "{{ ansible_inventory_sources | last | dirname }}" hook_path: "{{ appliances_environment_root }}/hooks/post.yml" import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}" when: hook_path | exists diff --git a/ansible/validate.yml b/ansible/validate.yml index d294e98e5..866d95d48 100644 --- a/ansible/validate.yml +++ b/ansible/validate.yml @@ -4,6 +4,7 @@ - name: Ensure control node is in inventory hosts: all + gather_facts: false tasks: - assert: that: groups['control'] | length @@ -11,6 +12,7 @@ - name: Validate openhpc configuration hosts: openhpc + gather_facts: false tags: openhpc tasks: - assert: @@ -22,6 +24,7 @@ - name: Validate podman configuration hosts: podman + gather_facts: false tags: podman tasks: - import_role: @@ -31,6 +34,7 @@ - name: Validate filebeat configuration hosts: filebeat + gather_facts: false tags: filebeat tasks: - import_role: @@ -75,3 +79,11 @@ - openondemand - openondemand_server - grafana + +- name: Validate freeipa configuration + hosts: freeipa + tags: freeipa + tasks: + - import_role: + name: freeipa + tasks_from: validate.yml diff --git a/environments/.caas/README.md b/environments/.caas/README.md new file mode 100644 index 000000000..4a08433b0 --- /dev/null +++ b/environments/.caas/README.md @@ -0,0 +1,18 @@ +# Caas cluster + +Environment for default Azimuth Slurm. This is not intended to be manually deployed. + +Non-standard things for this environment: +- There is no activate script. +- `ansible.cgf` is provided in the repo root, as expected by the caas operator. +- `ANSIBLE_INVENTORY` is set in the cluster type template, using a path relative to the + runner project directory: + + azimuth_caas_stackhpc_slurm_appliance_template: + ... + envVars: + ANSIBLE_INVENTORY: environments/common/inventory,environments/.caas/inventory + + Ansible then defines `ansible_inventory_sources` which contains absolute paths, and + that is used to derive the `appliances_environment_root` and + `appliances_repository_root`. 
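For example (repository path is illustrative), with the runner project checked out at `/srv/project` the derivation resolves as:

    # ansible_inventory_sources | last -> /srv/project/environments/.caas/inventory
    appliances_environment_root: /srv/project/environments/.caas   # ... | dirname
    appliances_repository_root: /srv/project                       # ... | dirname | dirname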
diff --git a/environments/.caas/ansible.cfg b/environments/.caas/ansible.cfg new file mode 100644 index 000000000..54a1c2a50 --- /dev/null +++ b/environments/.caas/ansible.cfg @@ -0,0 +1,15 @@ +[defaults] +any_errors_fatal = True +stdout_callback = debug +stderr_callback = debug +gathering = smart +forks = 30 +host_key_checking = False +inventory = ../common/inventory,inventory +collections_path = ../../ansible/collections +roles_path = ../../ansible/roles +filter_plugins = ../../ansible/filter_plugins + +[ssh_connection] +ssh_args = -o ControlMaster=auto ControlPath=~/.ssh/%r@%h-%p -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null +pipelining = True diff --git a/environments/.caas/assets/ood-icon.png b/environments/.caas/assets/ood-icon.png new file mode 100644 index 000000000..b5f4b6ea7 Binary files /dev/null and b/environments/.caas/assets/ood-icon.png differ diff --git a/environments/.stackhpc/cloud_init/.gitkeep b/environments/.caas/hooks/.gitkeep similarity index 100% rename from environments/.stackhpc/cloud_init/.gitkeep rename to environments/.caas/hooks/.gitkeep diff --git a/environments/.caas/hooks/post.yml b/environments/.caas/hooks/post.yml new file mode 100644 index 000000000..7aafe409f --- /dev/null +++ b/environments/.caas/hooks/post.yml @@ -0,0 +1,86 @@ +- name: Persist login hostkey across rebuilds +# Need NFS for this so can't do it before the appliance plays + hosts: login + gather_facts: no + become: yes + roles: + - persist_hostkeys + +# Configure the Zenith clients that are required +# First, ensure that podman is installed on all hosts that will run Zenith clients +- hosts: zenith,!podman + tasks: + - import_role: + name: podman + tasks_from: prereqs.yml + - import_role: + name: podman + tasks_from: config.yml + +- hosts: grafana + tasks: + - name: Deploy the Zenith client for Grafana + include_role: + name: zenith_proxy + vars: + zenith_proxy_service_name: zenith-monitoring + # Use the IP address for the upstream host + zenith_proxy_upstream_host: "{{ ansible_host }}" # IP + zenith_proxy_upstream_port: "{{ grafana_port }}" + zenith_proxy_client_token: "{{ zenith_token_monitoring }}" + zenith_proxy_client_auth_params: + tenancy-id: "{{ openstack_project_id }}" + zenith_proxy_mitm_enabled: yes + zenith_proxy_mitm_auth_inject: basic + zenith_proxy_mitm_auth_basic_username: "{{ grafana_security.admin_user }}" + zenith_proxy_mitm_auth_basic_password: "{{ grafana_security.admin_password }}" + when: zenith_subdomain_monitoring is defined + +- hosts: openondemand + tasks: + - name: Deploy the Zenith client for OOD + include_role: + name: zenith_proxy + vars: + zenith_proxy_service_name: zenith-ood + # Use the IP address for the upstream host + zenith_proxy_upstream_scheme: https + zenith_proxy_upstream_host: "{{ ansible_host }}" # IP + zenith_proxy_upstream_port: 443 + zenith_proxy_client_token: "{{ zenith_token_ood }}" + zenith_proxy_client_auth_params: + tenancy-id: "{{ openstack_project_id }}" + zenith_proxy_mitm_enabled: yes + zenith_proxy_mitm_auth_inject: basic + zenith_proxy_mitm_auth_basic_username: azimuth + zenith_proxy_mitm_auth_basic_password: "{{ vault_azimuth_user_password }}" + when: zenith_subdomain_ood is defined + +# Run hpctests if set in UI +- hosts: hpctests[0] + become: false + gather_facts: false + tasks: + - import_role: + name: hpctests + when: cluster_run_validation | default(false) | bool + +# Write the outputs as the final task +- hosts: localhost + tasks: + - debug: var=outputs + vars: + # Ansible has a 
fit when there are two 'hostvars' evaluations in a resolution chain, + # so we have to repeat logic here unfortunately + outputs: >- + {{- + { "cluster_access_ip": hostvars[groups['openstack'][0]].cluster_gateway_ip } | + combine( + { + "openondemand_url": "https://" ~ (hostvars[groups['openstack'][0]].cluster_gateway_ip | replace('.', '-')) ~ ".sslip.io", + "azimuth_user_password": hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_azimuth_user_password + } + if zenith_fqdn_ood is not defined + else {} + ) + }} \ No newline at end of file diff --git a/environments/.caas/hooks/pre.yml b/environments/.caas/hooks/pre.yml new file mode 100644 index 000000000..05b0255c8 --- /dev/null +++ b/environments/.caas/hooks/pre.yml @@ -0,0 +1,45 @@ +--- + +# Provision the infrastructure using Terraform +- name: Provision infrastructure + hosts: openstack + roles: + - cluster_infra + +# Ensure that the secrets are generated and persisted on the control host +- name: Generate and persist secrets + hosts: control + gather_facts: no + become: yes + roles: + - persist_openhpc_secrets + +# validate.yml asserts presence of a control group which doesn't exist when +# destroying infra, so only validate when we're not destroying +- hosts: openstack + gather_facts: no + become: no + tasks: + - set_fact: + appliances_validate: false + when: "cluster_state | default('') == 'absent'" + +# TODO: FIXME: maybe by doing the user move in cloud-init? +# The first task in the bootstrap playbook causes the home directory of the rocky user to be moved on the first run +# This can disrupt the SSH connection, particularly because we use the login host as a jump host +# So we move the home directory on the login node and reset the connections first +- hosts: login + gather_facts: false + tasks: + - name: Set up Ansible user + user: "{{ (appliances_local_users_default | selectattr('user.name', 'eq', appliances_local_users_ansible_user_name))[0]['user'] }}" + become_method: "sudo" + # Need to change working directory otherwise we try to switch back to non-existent directory. 
+ become_flags: '-i' + become: true + +- hosts: cluster + gather_facts: no + tasks: + - name: Reset persistent SSH connections + meta: reset_connection diff --git a/environments/.caas/inventory/everything b/environments/.caas/inventory/everything new file mode 120000 index 000000000..dc66b9576 --- /dev/null +++ b/environments/.caas/inventory/everything @@ -0,0 +1 @@ +../../../environments/common/layouts/everything \ No newline at end of file diff --git a/environments/.caas/inventory/extra_groups b/environments/.caas/inventory/extra_groups new file mode 100644 index 000000000..a6f06b7a7 --- /dev/null +++ b/environments/.caas/inventory/extra_groups @@ -0,0 +1,9 @@ +[basic_users:children] +cluster + +[etc_hosts:children] +cluster + +[zenith:children] +grafana +openondemand diff --git a/environments/skeleton/{{cookiecutter.environment}}/cloud_init/.gitkeep b/environments/.caas/inventory/group_vars/all/.gitkeep similarity index 100% rename from environments/skeleton/{{cookiecutter.environment}}/cloud_init/.gitkeep rename to environments/.caas/inventory/group_vars/all/.gitkeep diff --git a/environments/.caas/inventory/group_vars/all/basic_users.yml b/environments/.caas/inventory/group_vars/all/basic_users.yml new file mode 100644 index 000000000..6105df821 --- /dev/null +++ b/environments/.caas/inventory/group_vars/all/basic_users.yml @@ -0,0 +1,6 @@ +basic_users_users: + - name: azimuth + # Hash the password with a salt that is different for each host + password: "{{ vault_azimuth_user_password | password_hash('sha512', 65534 | random(seed=inventory_hostname) | string) }}" + uid: 1005 + public_key: "{{ cluster_user_ssh_public_key }}" diff --git a/environments/.caas/inventory/group_vars/all/cluster.yml b/environments/.caas/inventory/group_vars/all/cluster.yml new file mode 100644 index 000000000..b9ea63586 --- /dev/null +++ b/environments/.caas/inventory/group_vars/all/cluster.yml @@ -0,0 +1,22 @@ +# Account for the fact we are running outside of the expected environment system: +caas_inventory: "{{ ansible_inventory_sources | last }}" # ansible_inventory_sources is absolute +appliances_environment_root: "{{ caas_inventory | dirname }}" +appliances_repository_root: "{{ appliances_environment_root | dirname | dirname }}" + +# Read the secrets from the Ansible local facts on the control host +vault_azimuth_user_password: "{{ hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_azimuth_user_password }}" +vault_grafana_admin_password: "{{ hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_grafana_admin_password }}" +vault_elasticsearch_admin_password: "{{ hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_elasticsearch_admin_password }}" +vault_elasticsearch_kibana_password: "{{ hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_elasticsearch_kibana_password }}" +vault_mysql_root_password: "{{ hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_mysql_root_password }}" +vault_mysql_slurm_password: "{{ hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_mysql_slurm_password }}" +vault_openhpc_mungekey: "{{ hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_openhpc_mungekey }}" + +# Override this to cope with the case where the podman group just doesn't exist +appliances_local_users_podman_enable: "{{ groups.get('podman', []) | length > 0 }}" + +# The server name for Open OnDemand depends on whether Zenith is enabled or not +openondemand_servername_default: "{{ 
hostvars[groups['openstack'][0]].cluster_gateway_ip | replace('.', '-') ~ '.sslip.io' }}" +openondemand_servername: "{{ zenith_fqdn_ood | default(openondemand_servername_default) }}" + +appliances_state_dir: /var/lib/state diff --git a/environments/.caas/inventory/group_vars/all/grafana.yml b/environments/.caas/inventory/group_vars/all/grafana.yml new file mode 100644 index 000000000..10fdc926c --- /dev/null +++ b/environments/.caas/inventory/group_vars/all/grafana.yml @@ -0,0 +1 @@ +grafana_auth_anonymous: "{{ groups['openondemand'] | count > 0 }}" diff --git a/environments/.caas/inventory/group_vars/all/hpctests.yml b/environments/.caas/inventory/group_vars/all/hpctests.yml new file mode 100644 index 000000000..a31437be3 --- /dev/null +++ b/environments/.caas/inventory/group_vars/all/hpctests.yml @@ -0,0 +1,6 @@ +# Skip plotting pingpong as matplotlib not in runner environment +hpctests_pingpong_plot: false + +# In Azimuth, the Ansible controller is an ephemeral pod, so all that matters is that +# this is a location that is writable by the container user +hpctests_outdir: "{{ playbook_dir }}/.tmp/hpctests" diff --git a/environments/.caas/inventory/group_vars/all/nfs.yml b/environments/.caas/inventory/group_vars/all/nfs.yml new file mode 100644 index 000000000..2ea3abe57 --- /dev/null +++ b/environments/.caas/inventory/group_vars/all/nfs.yml @@ -0,0 +1,16 @@ +nfs_server: "{{ nfs_server_default }}" + +nfs_configurations: + - comment: Export /exports/home from Slurm control node as /home + nfs_enable: + server: "{{ inventory_hostname in groups['control'] }}" + clients: "{{ inventory_hostname in groups['cluster'] and inventory_hostname not in groups['control'] }}" + nfs_export: "/exports/home" # assumes skeleton TF is being used + nfs_client_mnt_point: "/home" + - comment: Export /var/lib/state from Slurm control node to OOD + nfs_enable: + server: "{{ inventory_hostname in groups['control'] }}" + clients: "{{ inventory_hostname in groups['openondemand'] }}" + nfs_export: "{{ appliances_state_dir }}" + nfs_client_mnt_point: "{{ appliances_state_dir }}" + nfs_client_mnt_options: "x-systemd.required-by=zenith-ood.service,x-systemd.before=zenith-ood.service" diff --git a/environments/.caas/inventory/group_vars/all/openhpc.yml b/environments/.caas/inventory/group_vars/all/openhpc.yml new file mode 100644 index 000000000..624402f9f --- /dev/null +++ b/environments/.caas/inventory/group_vars/all/openhpc.yml @@ -0,0 +1,5 @@ +openhpc_cluster_name: "{{ cluster_name }}" + +# Provision a single "standard" compute partition using the supplied +# node count and flavor +openhpc_slurm_partitions: "{{ hostvars[groups['openstack'][0]]['openhpc_slurm_partitions'] }}" diff --git a/environments/.caas/inventory/group_vars/all/openondemand.yml b/environments/.caas/inventory/group_vars/all/openondemand.yml new file mode 100644 index 000000000..60461bd61 --- /dev/null +++ b/environments/.caas/inventory/group_vars/all/openondemand.yml @@ -0,0 +1,9 @@ +--- +openondemand_auth: basic_pam +openondemand_jupyter_partition: "{{ openhpc_slurm_partitions[0]['name'] }}" +openondemand_desktop_partition: "{{ openhpc_slurm_partitions[0]['name'] }}" + +httpd_listen_addr_port: + - 80 + - 443 + diff --git a/environments/.caas/inventory/group_vars/all/prometheus.yml b/environments/.caas/inventory/group_vars/all/prometheus.yml new file mode 100644 index 000000000..eb28fda63 --- /dev/null +++ b/environments/.caas/inventory/group_vars/all/prometheus.yml @@ -0,0 +1,4 @@ +--- + +# Set Prometheus storage retention size 
+prometheus_storage_retention_size: "{{ metrics_db_maximum_size }}GB" diff --git a/environments/.caas/inventory/group_vars/all/selinux.yml b/environments/.caas/inventory/group_vars/all/selinux.yml new file mode 100644 index 000000000..1f1098126 --- /dev/null +++ b/environments/.caas/inventory/group_vars/all/selinux.yml @@ -0,0 +1 @@ +selinux_state: disabled \ No newline at end of file diff --git a/environments/.caas/inventory/group_vars/all/zenith.yml b/environments/.caas/inventory/group_vars/all/zenith.yml new file mode 100644 index 000000000..56dd0ca16 --- /dev/null +++ b/environments/.caas/inventory/group_vars/all/zenith.yml @@ -0,0 +1 @@ +zenith_proxy_podman_user: podman diff --git a/environments/.caas/inventory/group_vars/openstack.yml b/environments/.caas/inventory/group_vars/openstack.yml new file mode 100644 index 000000000..836078e10 --- /dev/null +++ b/environments/.caas/inventory/group_vars/openstack.yml @@ -0,0 +1,28 @@ +# The default Terraform state key for backends that support it +terraform_state_key: "cluster/{{ cluster_id }}/tfstate" + +# Set up the terraform backend +terraform_backend_type: "{{ 'consul' if 'CONSUL_HTTP_ADDR' in ansible_env else 'local' }}" +terraform_backend_config_defaults: + consul: + path: "{{ terraform_state_key }}" + gzip: "true" + local: {} +terraform_backend_config: "{{ terraform_backend_config_defaults[terraform_backend_type] }}" + +terraform_binary_directory: "{{ appliances_environment_root }}/bin" +terraform_project_path: "{{ playbook_dir }}/terraform" + +terraform_state: "{{ cluster_state | default('present') }}" +cluster_ssh_user: rocky + +# Set the size of the state volume to metrics_db_maximum_size + 10 +state_volume_size: "{{ metrics_db_maximum_size + 10 }}" + +# Provision a single "standard" compute partition using the supplied +# node count and flavor +openhpc_slurm_partitions: + - name: "standard" + count: "{{ compute_count }}" + flavor: "{{ compute_flavor }}" + default: "YES" diff --git a/environments/.caas/inventory/hosts b/environments/.caas/inventory/hosts new file mode 100644 index 000000000..88ce71000 --- /dev/null +++ b/environments/.caas/inventory/hosts @@ -0,0 +1,2 @@ +[openstack] +localhost ansible_connection=local ansible_python_interpreter=/usr/bin/python3 diff --git a/environments/.caas/ui-meta/slurm-infra-fast-volume-type.yml b/environments/.caas/ui-meta/slurm-infra-fast-volume-type.yml new file mode 100644 index 000000000..d210fec47 --- /dev/null +++ b/environments/.caas/ui-meta/slurm-infra-fast-volume-type.yml @@ -0,0 +1,117 @@ +name: "slurm" +label: "Slurm" +description: >- + Batch cluster running the Slurm workload manager, the Open + OnDemand web interface, and custom monitoring. +logo: https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Slurm_logo.svg/158px-Slurm_logo.svg.png + +parameters: + - name: cluster_floating_ip + label: External IP + description: The external IP to use for the login node. + kind: cloud.ip + immutable: true + + - name: compute_count + label: Compute node count + description: The number of compute nodes in the cluster. + kind: integer + options: + min: 1 + default: 3 + + - name: compute_flavor + label: Compute node size + description: The size to use for the compute node. 
+ kind: "cloud.size" + immutable: true + options: + min_ram: 2048 + min_disk: 20 + + - name: home_volume_size + label: Home volume size (GB) + description: The size of the cloud volume to use for home directories + kind: integer + immutable: true + options: + min: 10 + default: 100 + + - name: use_home_volume_type_fast + label: Provision high-performance storage for home directories + description: | + If a high-performance storage type is available to the Slurm platform, + use it for cluster home directories. If no high-performance storage type + is available, this option has no effect and a standard cloud volume will + be provisioned for home directories. + kind: boolean + required: false + default: true + options: + checkboxLabel: Put home directories on high-performance storage? + + - name: metrics_db_maximum_size + label: Metrics database size (GB) + description: | + The oldest metrics records in the [Prometheus](https://prometheus.io/) database will be + discarded to ensure that the database does not grow larger than this size. + + **A cloud volume of this size +10GB will be created to hold and persist the metrics + database and important Slurm files.** + kind: integer + immutable: true + options: + min: 10 + default: 10 + + - name: cluster_run_validation + label: Post-configuration validation + description: >- + If selected, post-configuration jobs will be executed to validate the core functionality + of the cluster when it is re-configured. + kind: boolean + required: false + default: true + options: + checkboxLabel: Run post-configuration validation? + +usage_template: |- + # Accessing the cluster using Open OnDemand + + [Open OnDemand](https://openondemand.org/) is a web portal for managing HPC jobs, including graphical + environments such as [Jupyter Notebooks](https://jupyter.org/). + + {% if cluster.outputs.openondemand_url %} + The Open OnDemand portal for this cluster is available at + [{{ cluster.outputs.openondemand_url.slice(8) }}]({{ cluster.outputs.openondemand_url }}). + + Enter the username `azimuth` and password `{{ cluster.outputs.azimuth_user_password }}` when prompted. + {% else %} + The Open OnDemand portal for this cluster can be accessed from the services list. + {% endif %} + + # Accessing the cluster using SSH + + The cluster can be accessed over SSH via the external IP. The SSH public key of the user that + deployed the cluster is injected into the `azimuth` user: + + ``` + $ ssh azimuth@{{ cluster.outputs.cluster_access_ip | default('[cluster ip]') }} + [azimuth@{{ cluster.name }}-login-0 ~]$ sinfo + PARTITION AVAIL TIMELIMIT NODES STATE NODELIST + compute* up 60-00:00:0 {{ "%3s" | format(cluster.parameter_values.compute_count) }} idle {{ cluster.name }}-compute-[0-{{ cluster.parameter_values.compute_count - 1 }}] + ``` + + The `rocky` user can be accessed the same way and has passwordless `sudo` enabled. + + SSH access can be granted to additional users by placing their SSH public key in `~azimuth/.ssh/authorized_keys`. 
+ +services: + - name: ood + label: Open OnDemand + icon_url: https://github.com/stackhpc/ansible-slurm-appliance/raw/main/environments/.caas/assets/ood-icon.png + - name: monitoring + label: Monitoring + icon_url: https://raw.githubusercontent.com/cncf/artwork/master/projects/prometheus/icon/color/prometheus-icon-color.png + diff --git a/environments/.caas/ui-meta/slurm-infra.yml b/environments/.caas/ui-meta/slurm-infra.yml new file mode 100644 index 000000000..250b96469 --- /dev/null +++ b/environments/.caas/ui-meta/slurm-infra.yml @@ -0,0 +1,103 @@ +name: "slurm" +label: "Slurm" +description: >- + Batch cluster running the Slurm workload manager, the Open + OnDemand web interface, and custom monitoring. +logo: https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Slurm_logo.svg/158px-Slurm_logo.svg.png + +parameters: + - name: cluster_floating_ip + label: External IP + description: The external IP to use for the login node. + kind: cloud.ip + immutable: true + + - name: compute_count + label: Compute node count + description: The number of compute nodes in the cluster. + kind: integer + options: + min: 1 + default: 3 + + - name: compute_flavor + label: Compute node size + description: The size to use for the compute node. + kind: "cloud.size" + immutable: true + options: + min_ram: 2048 + min_disk: 20 + + - name: home_volume_size + label: Home volume size (GB) + description: The size of the cloud volume to use for home directories + kind: integer + immutable: true + options: + min: 10 + default: 100 + + - name: metrics_db_maximum_size + label: Metrics database size (GB) + description: | + The oldest metrics records in the [Prometheus](https://prometheus.io/) database will be + discarded to ensure that the database does not grow larger than this size. + + **A cloud volume of this size +10GB will be created to hold and persist the metrics + database and important Slurm files.** + kind: integer + immutable: true + options: + min: 10 + default: 10 + + - name: cluster_run_validation + label: Post-configuration validation + description: >- + If selected, post-configuration jobs will be executed to validate the core functionality + of the cluster when it is re-configured. + kind: boolean + required: false + default: true + options: + checkboxLabel: Run post-configuration validation? + +usage_template: |- + # Accessing the cluster using Open OnDemand + + [Open OnDemand](https://openondemand.org/) is a web portal for managing HPC jobs, including graphical + environments such as [Jupyter Notebooks](https://jupyter.org/). + + {% if cluster.outputs.openondemand_url %} + The Open OnDemand portal for this cluster is available at + [{{ cluster.outputs.openondemand_url.slice(8) }}]({{ cluster.outputs.openondemand_url }}). + + Enter the username `azimuth` and password `{{ cluster.outputs.azimuth_user_password }}` when prompted. + {% else %} + The Open OnDemand portal for this cluster can be accessed from the services list. + {% endif %} + + # Accessing the cluster using SSH + + The cluster can be accessed over SSH via the external IP. 
The SSH public key of the user that + deployed the cluster is injected into the `azimuth` user: + + ``` + $ ssh azimuth@{{ cluster.outputs.cluster_access_ip | default('[cluster ip]') }} + [azimuth@{{ cluster.name }}-login-0 ~]$ sinfo + PARTITION AVAIL TIMELIMIT NODES STATE NODELIST + compute* up 60-00:00:0 {{ "%3s" | format(cluster.parameter_values.compute_count) }} idle {{ cluster.name }}-compute-[0-{{ cluster.parameter_values.compute_count - 1 }}] + ``` + + The `rocky` user can be accessed the same way and has passwordless `sudo` enabled. + + SSH access can be granted to additional users by placing their SSH public key in `~azimuth/.ssh/authorized_keys`. + +services: + - name: ood + label: Open OnDemand + icon_url: https://github.com/stackhpc/ansible-slurm-appliance/raw/main/environments/.caas/assets/ood-icon.png + - name: monitoring + label: Monitoring + icon_url: https://raw.githubusercontent.com/cncf/artwork/master/projects/prometheus/icon/color/prometheus-icon-color.png diff --git a/environments/.stackhpc/activate b/environments/.stackhpc/activate index e74031095..2a58b40e4 100644 --- a/environments/.stackhpc/activate +++ b/environments/.stackhpc/activate @@ -1,8 +1,7 @@ export APPLIANCES_ENVIRONMENT_ROOT=$(dirname $(realpath ${BASH_SOURCE[0]:-${(%):-%x}})) echo "Setting APPLIANCES_ENVIRONMENT_ROOT to $APPLIANCES_ENVIRONMENT_ROOT" -APPLIANCES_ENVIRONMENT_NAME=$(basename $APPLIANCES_ENVIRONMENT_ROOT) -export PS1="${APPLIANCES_ENVIRONMENT_NAME}/ ${PS1}" +export PS1="$(basename $APPLIANCES_ENVIRONMENT_ROOT)/ ${PS1}" export APPLIANCES_REPO_ROOT=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT/../..") echo "Setting APPLIANCES_REPO_ROOT to $APPLIANCES_REPO_ROOT" diff --git a/environments/.stackhpc/inventory/extra_groups b/environments/.stackhpc/inventory/extra_groups index cc87628e7..d1da6f4a8 100644 --- a/environments/.stackhpc/inventory/extra_groups +++ b/environments/.stackhpc/inventory/extra_groups @@ -7,3 +7,18 @@ compute [etc_hosts:children] cluster + +# -- Example of enabling FreeIPA with an in-appliance (dev-only) server +# NB: The etc_hosts and basic_users group definitions above should be commented out +# The freeipa_* hosts will pick up configuration from environments/.stackhpc/inventory/group_vars/all/freeipa.yml + +# [freeipa_server:children] +# control +# +# [freeipa_client:children] +# login +# compute +# +# [resolv_conf:children] +# freeipa_client +# --- end of FreeIPA example --- diff --git a/environments/.stackhpc/inventory/group_vars/all/basic_users.yml b/environments/.stackhpc/inventory/group_vars/all/basic_users.yml index 2f90a1d60..ae416cf72 100644 --- a/environments/.stackhpc/inventory/group_vars/all/basic_users.yml +++ b/environments/.stackhpc/inventory/group_vars/all/basic_users.yml @@ -1,7 +1,6 @@ -# has to be defined on 'all' group so localhost can template out for cloud-init -testuser_password: "{{ lookup('env', 'TESTUSER_PASSWORD') | default(vault_testuser_password, true) }}" +test_user_password: "{{ lookup('env', 'TESTUSER_PASSWORD') | default(vault_testuser_password, true) }}" # CI uses env, debug can set vault_testuser_password basic_users_users: - name: testuser # can't use rocky as $HOME isn't shared! 
- password: "{{ testuser_password | password_hash('sha512', 65534 | random(seed=inventory_hostname) | string) }}" # idempotent + password: "{{ test_user_password | password_hash('sha512', 65534 | random(seed=inventory_hostname) | string) }}" # idempotent uid: 1005 diff --git a/environments/.stackhpc/inventory/group_vars/all/freeipa.yml b/environments/.stackhpc/inventory/group_vars/all/freeipa.yml new file mode 100644 index 000000000..4b3750650 --- /dev/null +++ b/environments/.stackhpc/inventory/group_vars/all/freeipa.yml @@ -0,0 +1,12 @@ +# This file provides examples of using freeipa role variables. These are NOT functional in CI as freeipa_{server,client} groups are not defined. + +# NB: Users defined this way have expired passwords +freeipa_users: + - name: testuser # can't use rocky as $HOME isn't shared! + password: "{{ test_user_password }}" + givenname: test + sn: test + +# freeipa_client hosts must use a FreeIPA server for name resolution - requires hosts to be in group `resolv_conf`. +resolv_conf_nameservers: + - "{{ hostvars[groups['freeipa_server'].0].ansible_host }}" diff --git a/environments/.stackhpc/inventory/group_vars/basic_users/overrides.yml b/environments/.stackhpc/inventory/group_vars/basic_users/overrides.yml deleted file mode 100644 index ae416cf72..000000000 --- a/environments/.stackhpc/inventory/group_vars/basic_users/overrides.yml +++ /dev/null @@ -1,6 +0,0 @@ -test_user_password: "{{ lookup('env', 'TESTUSER_PASSWORD') | default(vault_testuser_password, true) }}" # CI uses env, debug can set vault_testuser_password - -basic_users_users: - - name: testuser # can't use rocky as $HOME isn't shared! - password: "{{ test_user_password | password_hash('sha512', 65534 | random(seed=inventory_hostname) | string) }}" # idempotent - uid: 1005 diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index 36b4d3c6a..598dbf59a 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ -3,6 +3,7 @@ ansible_user: rocky appliances_repository_root: "{{ lookup('env', 'APPLIANCES_REPO_ROOT') }}" appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" +appliances_environment_name: "{{ appliances_environment_root | basename | regex_replace('\\W+', '') }}" # [a-zA-Z0-9_] only #appliances_state_dir: # define an absolute path here to use for persistent state: NB: This is defined as /var/lib/state in inventory by the default Terraform # Address(ip/dns) for internal communication between services. This is diff --git a/environments/common/inventory/group_vars/all/freeipa_server.yml b/environments/common/inventory/group_vars/all/freeipa_server.yml new file mode 100644 index 000000000..7f0fee713 --- /dev/null +++ b/environments/common/inventory/group_vars/all/freeipa_server.yml @@ -0,0 +1,7 @@ +# See ansible/roles/freeipa/README.md +# These vars are only used when freeipa_server is enabled. 
They are not required when enabling only freeipa_client +freeipa_realm: "{{ openhpc_cluster_name | upper }}.{{ cluster_domain_suffix | upper }}" +freeipa_ds_password: "{{ vault_freeipa_ds_password }}" +freeipa_admin_password: "{{ vault_freeipa_admin_password }}" +# The below doesn't use ansible_default_ipv4.address as that requires facts; using hostvars also allows templating when the freeipa_server group is empty +freeipa_server_ip: "{{ hostvars[groups['freeipa_server'].0].ansible_host if groups['freeipa_server'] else false }}" diff --git a/environments/common/inventory/group_vars/all/openondemand.yml b/environments/common/inventory/group_vars/all/openondemand.yml index c63191095..a675279ba 100644 --- a/environments/common/inventory/group_vars/all/openondemand.yml +++ b/environments/common/inventory/group_vars/all/openondemand.yml @@ -15,12 +15,14 @@ openondemand_host_regex: "{{ (groups['compute'] + groups['grafana']) | to_ood_regex }}" ondemand_package: ondemand-3.0.1 -openondemand_dashboard_links: # TODO: should really only be deployed if grafana is deployed and proxying configured +# Only add Grafana to the Open OnDemand dashboard links if the grafana group is non-empty +openondemand_dashboard_links_grafana: - name: Grafana app_name: grafana category: Monitoring description: Dashboards url: "{{ grafana_url_openondemand_proxy }}" +openondemand_dashboard_links: "{{ openondemand_dashboard_links_grafana if groups['grafana'] | length > 0 else [] }}" openondemand_clusters: slurm: @@ -52,21 +54,23 @@ openondemand_clusters: export -f xfce4-session %s set_host: host=$(hostname -s) - custom: - # embed grafana panels in Jobs app: https://osc.github.io/ood-documentation/latest/customization.html#grafana-support - grafana: - host: "{{ grafana_url }}" - orgId: 1 - dashboard: - name: "node-exporter-slurm" - uid: "node-exporter-slurm" - panels: - cpu: 77 - memory: 78 - labels: - cluster: "cluster" - host: "host" - jobid: "jobid" + custom: "{{ openondemand_clusters_grafana if groups['grafana'] | length > 0 else {} }}" + +openondemand_clusters_grafana: + # embed grafana panels in Jobs app: https://osc.github.io/ood-documentation/latest/customization.html#grafana-support + grafana: + host: "{{ grafana_url }}" + orgId: 1 + dashboard: + name: "node-exporter-slurm" + uid: "node-exporter-slurm" + panels: + cpu: 77 + memory: 78 + labels: + cluster: "cluster" + host: "host" + jobid: "jobid" ood_install_apps_defaults: jupyter: @@ -174,7 +178,7 @@ openondemand_scrape_configs: - targets: - "{{ openondemand_address }}:9301" labels: - environment: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_NAME') }}" + environment: "{{ appliances_environment_name }}" service: "openondemand" openondemand_dashboard: diff --git a/environments/common/inventory/group_vars/all/update.yml b/environments/common/inventory/group_vars/all/update.yml index b409ea3d6..715d418c7 100644 --- a/environments/common/inventory/group_vars/all/update.yml +++ b/environments/common/inventory/group_vars/all/update.yml @@ -9,4 +9,4 @@ update_exclude: - apptainer # see https://github.com/stackhpc/ansible-slurm-appliance/pull/245 update_disablerepo: omit # Log changes during update here on localhost: -update_log_path: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/logs/{{ inventory_hostname }}-updates.log" +update_log_path: "{{ appliances_environment_root }}/logs/{{ inventory_hostname }}-updates.log" diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index 475052d3b..59629c04e 100644 --- a/environments/common/inventory/groups +++
b/environments/common/inventory/groups @@ -104,8 +104,20 @@ grafana control prometheus +[freeipa_server] +# Hosts to be a FreeIPA server. **NB**: Intended only for test/development use. See ansible/roles/freeipa/README.md + +[freeipa_client] +# Hosts to be a FreeIPA client. See ansible/roles/freeipa/README.md + +[freeipa:children] +# Allows defining variables common to freeipa_server and _client +freeipa_server +freeipa_client + [cuda] # Hosts to install NVIDIA CUDA on - see ansible/roles/cuda/README.md + [resolv_conf] # Allows defining nameservers in /etc/resolv.conf - see ansible/roles/resolv_conf/README.md diff --git a/environments/skeleton/{{cookiecutter.environment}}/activate b/environments/skeleton/{{cookiecutter.environment}}/activate index e74031095..2a58b40e4 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/activate +++ b/environments/skeleton/{{cookiecutter.environment}}/activate @@ -1,8 +1,7 @@ export APPLIANCES_ENVIRONMENT_ROOT=$(dirname $(realpath ${BASH_SOURCE[0]:-${(%):-%x}})) echo "Setting APPLIANCES_ENVIRONMENT_ROOT to $APPLIANCES_ENVIRONMENT_ROOT" -APPLIANCES_ENVIRONMENT_NAME=$(basename $APPLIANCES_ENVIRONMENT_ROOT) -export PS1="${APPLIANCES_ENVIRONMENT_NAME}/ ${PS1}" +export PS1="$(basename $APPLIANCES_ENVIRONMENT_ROOT)/ ${PS1}" export APPLIANCES_REPO_ROOT=$(realpath "$APPLIANCES_ENVIRONMENT_ROOT/../..") echo "Setting APPLIANCES_REPO_ROOT to $APPLIANCES_REPO_ROOT" diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/inventory.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/inventory.tf index d7298015c..5f195caf2 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/inventory.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/inventory.tf @@ -2,13 +2,13 @@ resource "local_file" "hosts" { content = templatefile("${path.module}/inventory.tpl", { "cluster_name": var.cluster_name, - "control": openstack_networking_port_v2.control, + "cluster_domain_suffix": var.cluster_domain_suffix, + "control_instances": openstack_compute_instance_v2.control + "login_instances": openstack_compute_instance_v2.login + "compute_instances": openstack_compute_instance_v2.compute "state_dir": var.state_dir, - "logins": openstack_networking_port_v2.login, - "computes": openstack_networking_port_v2.compute, "compute_types": var.compute_types, "compute_nodes": var.compute_nodes, - "subnet": data.openstack_networking_subnet_v2.cluster_subnet, }, ) filename = "../inventory/hosts" diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/inventory.tpl b/environments/skeleton/{{cookiecutter.environment}}/terraform/inventory.tpl index 43c3250ee..11b2cfd45 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/inventory.tpl +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/inventory.tpl @@ -1,21 +1,24 @@ [all:vars] openhpc_cluster_name=${cluster_name} +cluster_domain_suffix=${cluster_domain_suffix} [control] -${control.name} ansible_host=${control.all_fixed_ips[0]} +%{ for control in control_instances ~} +${ control.name } ansible_host=${[for n in control.network: n.fixed_ip_v4 if n.access_network][0]} node_fqdn=${ control.name }.${cluster_name}.${cluster_domain_suffix} +%{ endfor ~} [control:vars] # NB needs to be set on group not host otherwise it is ignored in packer build! 
appliances_state_dir=${state_dir} [login] -%{ for login in logins ~} -${login.name} ansible_host=${login.all_fixed_ips[0]} +%{ for login in login_instances ~} +${ login.name } ansible_host=${[for n in login.network: n.fixed_ip_v4 if n.access_network][0]} node_fqdn=${ login.name }.${cluster_name}.${cluster_domain_suffix} %{ endfor ~} [compute] -%{ for compute in computes ~} -${compute.name} ansible_host=${compute.all_fixed_ips[0]} +%{ for compute in compute_instances ~} +${ compute.name } ansible_host=${[for n in compute.network: n.fixed_ip_v4 if n.access_network][0]} node_fqdn=${ compute.name }.${cluster_name}.${cluster_domain_suffix} %{ endfor ~} # Define groups for slurm parititions: @@ -23,7 +26,7 @@ ${compute.name} ansible_host=${compute.all_fixed_ips[0]} [${cluster_name}_${type_name}] %{~ for node_name, node_type in compute_nodes ~} %{~ if node_type == type_name ~} -${cluster_name}-${node_name} +${ compute_instances[node_name].name } %{~ endif ~} %{~ endfor ~} %{ endfor ~} diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf index dc38ad487..914b5f1e1 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf @@ -124,6 +124,8 @@ resource "openstack_compute_instance_v2" "control" { user_data = <<-EOF #cloud-config + fqdn: ${var.cluster_name}-${each.key}.${var.cluster_name}.${var.cluster_domain_suffix} + fs_setup: - label: state filesystem: ext4 @@ -177,6 +179,11 @@ resource "openstack_compute_instance_v2" "login" { environment_root = var.environment_root } + user_data = <<-EOF + #cloud-config + fqdn: ${var.cluster_name}-${each.key}.${var.cluster_name}.${var.cluster_domain_suffix} + EOF + lifecycle{ ignore_changes = [ image_name, @@ -215,6 +222,11 @@ resource "openstack_compute_instance_v2" "compute" { environment_root = var.environment_root } + user_data = <<-EOF + #cloud-config + fqdn: ${var.cluster_name}-${each.key}.${var.cluster_name}.${var.cluster_domain_suffix} + EOF + lifecycle{ ignore_changes = [ image_name, diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf index b0bfd2366..0804c6f33 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/variables.tf @@ -1,6 +1,12 @@ variable "cluster_name" { type = string - description = "Name for cluster, used as prefix for resources" + description = "Name of cluster, used as part of domain name" +} + +variable "cluster_domain_suffix" { + type = string + description = "Domain suffix for cluster" + default = "invalid" } variable "cluster_net" { diff --git a/requirements.yml b/requirements.yml index 3f65a27ac..4cc3b735b 100644 --- a/requirements.yml +++ b/requirements.yml @@ -6,7 +6,7 @@ roles: version: v0.23.0 # https://github.com/stackhpc/ansible-role-openhpc/pull/165 name: stackhpc.openhpc - src: https://github.com/stackhpc/ansible-node-exporter.git - version: feature/no-install + version: stackhpc name: cloudalchemy.node_exporter - src: https://github.com/cloudalchemy/ansible-prometheus.git version: 4d2c8d742de39e50387e0aa6d5510b21c7451343 # need fix in preceeding commit for rocky @@ -22,9 +22,26 @@ roles: version: v3.0.6 collections: -- name: containers.podman -- name: community.grafana -- name: 
https://github.com/stackhpc/ansible_collection_slurm_openstack_tools - type: git - version: v0.2.0 + - name: containers.podman + version: 1.10.2 + - name: community.grafana + version: 1.5.4 + - name: https://github.com/stackhpc/ansible_collection_slurm_openstack_tools + type: git + version: v0.2.0 + - name: ansible.posix + version: 1.5.4 + - name: ansible.netcommon + version: 5.1.1 + - name: community.general + version: 7.1.0 + - name: community.crypto + version: 2.10.0 + - name: community.mysql + version: 3.7.2 + - name: openstack.cloud + version: 2.1.0 + - name: https://github.com/stackhpc/ansible-collection-terraform + type: git + version: 0.1.0 ...
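
For reference, the pinned roles and collections above would typically be installed with ansible-galaxy into the repo-relative paths referenced by the ansible.cfg files (ansible/roles and ansible/collections). A minimal sketch, run from the repository root and assuming no wrapper script is used:

```
# Sketch only: install pinned dependencies from requirements.yml into the
# roles_path / collections_path referenced by ansible.cfg
ansible-galaxy role install -r requirements.yml -p ansible/roles
ansible-galaxy collection install -r requirements.yml -p ansible/collections
```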