Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Containerise prometheus #308

Open
wants to merge 15 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions ansible/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,5 @@ roles/*
!roles/persist_hostkeys/
!roles/persist_hostkeys/**
!roles/requirements.yml
!roles/prometheus/
!roles/prometheus/**
53 changes: 3 additions & 50 deletions ansible/fatimage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -98,57 +98,10 @@
vars:
slurm_exporter_state: stopped

- import_role:
name: cloudalchemy.prometheus
tasks_from: preflight.yml

# can't run cloudalchemy.prometheus/tasks/install.yml as it triggers a unit start
# so below is a partial extraction of this:
- name: create prometheus system group
group:
- name: Install containerised Prometheus
include_role:
name: prometheus
system: true
state: present

- name: create prometheus system user
user:
name: prometheus
system: true
shell: "/usr/sbin/nologin"
group: prometheus
createhome: false
home: "{{ prometheus_db_dir }}"

- name: download prometheus binary to local folder
become: false
get_url:
url: "https://github.com/prometheus/prometheus/releases/download/v{{ prometheus_version }}/prometheus-{{ prometheus_version }}.linux-{{ go_arch }}.tar.gz"
dest: "/tmp/prometheus-{{ prometheus_version }}.linux-{{ go_arch }}.tar.gz"
checksum: "sha256:{{ __prometheus_checksum }}"
register: _download_archive
until: _download_archive is succeeded
retries: 5
delay: 2

- name: unpack prometheus binaries
become: false
unarchive:
remote_src: yes
src: "/tmp/prometheus-{{ prometheus_version }}.linux-{{ go_arch }}.tar.gz"
dest: "/tmp"
creates: "/tmp/prometheus-{{ prometheus_version }}.linux-{{ go_arch }}/prometheus"

- name: propagate official prometheus and promtool binaries
copy:
remote_src: yes
src: "/tmp/prometheus-{{ prometheus_version }}.linux-{{ go_arch }}/{{ item }}"
dest: "{{ _prometheus_binary_install_dir }}/{{ item }}"
mode: 0755
owner: root
group: root
with_items:
- prometheus
- promtool
tasks_from: install.yml

- name: Include distribution variables for cloudalchemy.grafana
include_vars: "{{ appliances_repository_root }}/ansible/roles/cloudalchemy.grafana/vars/redhat.yml"
Expand Down
23 changes: 9 additions & 14 deletions ansible/monitoring.yml
Original file line number Diff line number Diff line change
Expand Up @@ -67,22 +67,17 @@

- name: Setup core monitoring software
hosts: prometheus
become: true
tags: prometheus
tasks:
- name: Check for existing prometheus binaries
stat:
path: /usr/local/bin/{{ item }}
register: prometheus_binaries
loop:
- prometheus
- promtool
- name: Skip prometheus install if prometheus binaries exist and prometheus_version not defined
# i.e. if prometheus_version isn't defined we don't care, so use what's already there
set_fact:
prometheus_skip_install: "{{ false if prometheus_version is defined else true }}"
when: "{{ (prometheus_binaries.results | map(attribute='stat') | map(attribute='exists')) + [prometheus_skip_install is not defined] }}"
- import_role:
name: cloudalchemy.prometheus
- name: Install containerised Prometheus
include_role:
name: prometheus
tasks_from: install.yml
- name: Start Prometheus
include_role:
name: prometheus
tasks_from: runtime.yml

- name: Deploy grafana
hosts: grafana
Expand Down
9 changes: 9 additions & 0 deletions ansible/roles/podman/defaults/main.yml
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
# Defaults for the podman role.
podman_users:
- name: "{{ ansible_user }}"
podman_tmp_dir_root: /run # MUST be on a tmpfs

# Defaults consumed by tasks/systemd-unit.yml and templates/systemd.container.service.j2.
# from azimuth-images/blob/main/ansible/roles/linux-podman/defaults/main.yml
# Selects the unit template: systemd.<podman_service_type>.service.j2
podman_service_type: container
# Extra unit names (without the .service suffix); each is rendered as Wants= and After=
podman_service_wants: []
# Each entry is passed to `podman run` as --publish
podman_service_ports: []
# Each name/value pair is passed to `podman run` as --env name=value
podman_service_env: {}
# Each entry is passed to `podman run` as --volume
podman_service_volumes: []
# Rendered as User= and Group= in the unit's [Service] section
podman_service_user: podman
podman_service_group: "{{ podman_service_user }}"
7 changes: 7 additions & 0 deletions ansible/roles/podman/tasks/systemd-unit.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---

# Render the unit file for one podman-managed service.
# Callers set podman_service_name (and optionally the other podman_service_*
# variables); the result is registered as podman_systemd_unit so callers can
# react to a changed unit (e.g. daemon-reload or restart).
- name: "Install systemd unit for {{ podman_service_name }}"
  template:
    src: "systemd.{{ podman_service_type }}.service.j2"
    dest: "/etc/systemd/system/{{ podman_service_name }}.service"
    # Explicit ownership/permissions so the result does not depend on the umask.
    owner: root
    group: root
    mode: "0644"
  register: podman_systemd_unit
52 changes: 52 additions & 0 deletions ansible/roles/podman/templates/systemd.container.service.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
{# Systemd unit which runs a single podman container as a service.
   Inputs are the podman_service_* variables: name and image are required;
   network, pod and command are optional; wants, ports, env, volumes, user
   and group come from the podman role defaults. #}
[Unit]
Description=Podman container-{{ podman_service_name }}.service
Documentation=man:podman-generate-systemd(1)
{# Wants= and After= must name the same target: ordering after
   network-online.target has no effect unless the unit also pulls it in. #}
Wants=network-online.target
After=network-online.target
{% if podman_service_pod is defined and podman_service_pod %}
BindsTo={{ podman_service_pod }}.service
PartOf={{ podman_service_pod }}.service
After={{ podman_service_pod }}.service
{% endif %}
{% for service in podman_service_wants %}
Wants={{ service }}.service
After={{ service }}.service
{% endfor %}

[Service]
Environment=PODMAN_SYSTEMD_UNIT=%n
Type=simple
Restart=always
User={{ podman_service_user }}
Group={{ podman_service_group }}
ExecStart=/usr/bin/podman run \
  --cgroups=no-conmon \
  --replace \
  --restart=no \
  --name {{ podman_service_name }} \
{% if podman_service_network is defined %}
  --network {{ podman_service_network }} \
{% endif %}
{% if podman_service_pod is defined and podman_service_pod %}
  --pod {{ podman_service_pod }} \
{% endif %}
{% for port in podman_service_ports %}
  --publish {{ port }} \
{% endfor %}
{# "~" stringifies both operands, so numeric or boolean env values do not
   raise a type error the way "+" concatenation does. #}
{% for name, value in podman_service_env.items() %}
  --env {{ (name ~ "=" ~ value) | quote }} \
{% endfor %}
{% for volume in podman_service_volumes %}
  --volume {{ volume }} \
{% endfor %}
{% if podman_service_command is defined %}
  {{ podman_service_image }} \
  {{ podman_service_command }}
{% else %}
  {{ podman_service_image }}
{% endif %}
ExecStop=/usr/bin/podman stop --ignore -t 10 {{ podman_service_name }}
ExecStopPost=/usr/bin/podman rm --ignore -f {{ podman_service_name }}

[Install]
WantedBy=multi-user.target default.target
12 changes: 12 additions & 0 deletions ansible/roles/prometheus/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# prometheus
Creates a systemd service `prometheus` which uses the `podman` user to run a containerised [Prometheus](https://github.com/prometheus/prometheus) monitoring system.

Note this role contains two task files:
- `install.yml`: This is safe to run during a Packer build. It creates the systemd unit file.
- `runtime.yml`: This cannot be run during a Packer build. It templates out config and restarts/starts the service as required.

## Role Variables

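See `defaults/main.yml`. All variables can be updated by running `runtime.yml`, except the below, which are written into the systemd unit file and therefore require `install.yml` to be run to update:
- `prometheus_container_image`
- `prometheus_version`
- `prometheus_config_dir`
- `prometheus_db_dir`
- `prometheus_storage_retention_size`
- `prometheus_storage_retention`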
26 changes: 26 additions & 0 deletions ansible/roles/prometheus/defaults/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Container image (without tag) and image tag used for the Prometheus service.
prometheus_container_image: "prom/prometheus"
prometheus_version: "v2.48.1"

# Variables below are compatible with the deprecated, non-containerised [cloudalchemy.prometheus](https://github.com/cloudalchemy/ansible-prometheus) role
prometheus_alert_rules: []
prometheus_alert_rules_files:
  - prometheus/rules/*.rules
# Read by templates/prometheus.yml.j2; the alerting section is only rendered
# when prometheus_alertmanager_config is non-empty.
prometheus_alertmanager_config: []
prometheus_alert_relabel_configs: []
prometheus_config_dir: /etc/prometheus
prometheus_db_dir: /var/lib/prometheus
prometheus_external_labels:
  environment: "{{ ansible_fqdn | default(ansible_host) | default(inventory_hostname) }}"
prometheus_global:
  scrape_interval: 15s
  scrape_timeout: 10s
  evaluation_interval: 15s
prometheus_remote_read: []
prometheus_remote_write: []
# Rendered directly under `scrape_configs:` in prometheus.yml, so must be a list.
prometheus_scrape_configs: []
prometheus_storage_retention_size: "0GB"
prometheus_storage_retention: "30d"
# Mapping of <file_sd file name> -> <target list>; iterated with with_dict in
# tasks/runtime.yml, so must be a dict.
prometheus_targets: {}
# See https://github.com/prometheus/exporter-toolkit/blob/master/docs/web-configuration.md
prometheus_web_config:
  tls_server_config: {}
  http_server_config: {}
  basic_auth_users: {}
5 changes: 5 additions & 0 deletions ansible/roles/prometheus/handlers/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Handler notified by the tasks in tasks/runtime.yml.
# Enables the `prometheus` systemd unit and restarts it.
- name: Restart Prometheus
systemd:
name: prometheus
enabled: true
state: restarted
25 changes: 25 additions & 0 deletions ansible/roles/prometheus/tasks/install.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
---

# Template the `prometheus` systemd unit by delegating to the podman role's
# systemd-unit.yml. The image tag, the two host volume paths and the retention
# flags are all rendered into the unit file here, so changing any of them
# requires re-running this task file.
# The container uses host networking and mounts the host config directory
# read-only at /etc/prometheus/ and the host data directory at /prometheus.
# NOTE(review): no --web.config.file flag is passed in the command below, yet
# runtime.yml writes web.yml from prometheus_web_config - confirm whether that
# file is meant to be consumed by Prometheus.
- name: Install systemd unit for Prometheus
include_role:
name: podman
tasks_from: systemd-unit.yml
vars:
podman_service_name: prometheus
podman_service_type: container
podman_service_image: "{{ prometheus_container_image }}:{{ prometheus_version }}"
podman_service_volumes:
- "{{ prometheus_config_dir }}:/etc/prometheus/:U,ro"
- "{{ prometheus_db_dir }}:/prometheus:U"
podman_service_command: |
--config.file=/etc/prometheus/prometheus.yml \
--storage.tsdb.path=/prometheus \
--web.console.libraries=/usr/share/prometheus/console_libraries \
--web.console.templates=/usr/share/prometheus/consoles \
--storage.tsdb.retention.size={{ prometheus_storage_retention_size }} \
--storage.tsdb.retention.time={{ prometheus_storage_retention }}
podman_service_network: host

# Make systemd re-read its unit files, but only when the unit template task
# (registered as podman_systemd_unit by the podman role) actually changed the
# file. Uses the systemd module rather than shelling out to systemctl.
- name: Reload Prometheus unit file
  systemd:
    daemon_reload: true
  when: podman_systemd_unit.changed
83 changes: 83 additions & 0 deletions ansible/roles/prometheus/tasks/runtime.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
---
# Based on cloudalchemy.prometheus/tasks/configure.yml so to support same configurations
# Main changes for containerisation are:
# - user/group of files should be podman
# - cannot reload service

# Create the host-side config directory (plus its rules/ and file_sd/
# subdirectories) and the data directory, owned by podman:podman.
# NOTE(review): `recurse: true` applies ug=rwX,o= to every existing file under
# these directories on each run, while several later tasks in this file set
# mode 0644 on the same files - this looks like it makes both tasks report
# "changed" on every run (and fire the restart handler where notified).
# Confirm whether the recursion is required.
- name: Ensure Prometheus host directories exists
file:
path: "{{ item }}"
state: directory
owner: podman
group: podman
mode: ug=rwX,o=
recurse: true
loop:
- "{{ prometheus_config_dir }}"
- "{{ prometheus_config_dir }}/rules"
- "{{ prometheus_config_dir }}/file_sd"
- "{{ prometheus_db_dir }}" # this should be on a volume, so can't do during install==image build

# Render the role-managed alerting rules file from prometheus_alert_rules.
# Skipped when no rules are defined; a change triggers a Prometheus restart.
- name: Configure alerting rules file
  template:
    src: "alert.rules.j2"
    dest: "{{ prometheus_config_dir }}/rules/ansible_managed.rules"
    owner: podman
    group: podman
    # Quoted so the mode is passed as a string rather than parsed as a YAML octal integer.
    mode: "0644"
    # validate: "{{ _prometheus_binary_install_dir }}/promtool check rules %s" # TODO: is in container, could use it
  when: prometheus_alert_rules != []
  notify: Restart Prometheus

# Copy every file matched by the prometheus_alert_rules_files globs into the
# rules directory; a change triggers a Prometheus restart.
- name: Copy custom alerting rule files
  copy:
    src: "{{ item }}"
    dest: "{{ prometheus_config_dir }}/rules/"
    owner: podman
    group: podman
    # Quoted so the mode is passed as a string rather than parsed as a YAML octal integer.
    mode: "0644"
    # validate: "{{ _prometheus_binary_install_dir }}/promtool check rules %s" # TODO: promtool is in the container, could use it
  with_fileglob: "{{ prometheus_alert_rules_files }}"
  notify: Restart Prometheus

# Render the main Prometheus configuration file; a change triggers a restart.
- name: Configure Prometheus
  template:
    src: prometheus.yml.j2
    dest: "{{ prometheus_config_dir }}/prometheus.yml"
    # Owned by podman like every other file this task file writes into the
    # config directory. Mode is left unmanaged here, as in the original task.
    owner: podman
    group: podman
    # validate: "{{ _prometheus_binary_install_dir }}/promtool check config %s" # TODO: promtool is in the container, could use it
  notify: Restart Prometheus

# Write prometheus_web_config as YAML to web.yml in the config directory.
- name: Configure Prometheus web
  copy:
    content: "{{ prometheus_web_config | to_nice_yaml(indent=2) }}"
    dest: "{{ prometheus_config_dir }}/web.yml"
    owner: podman
    group: podman
    # Quoted so the mode is passed as a string rather than parsed as a YAML octal integer.
    mode: "0644"

# Write one file_sd/<key>.yml file per entry of the prometheus_targets mapping,
# containing that entry's value serialised as YAML.
- name: Configure prometheus static targets
  copy:
    content: |
      #jinja2: lstrip_blocks: True
      {{ item.value | to_nice_yaml(indent=2) }}
    dest: "{{ prometheus_config_dir }}/file_sd/{{ item.key }}.yml"
    force: true
    owner: podman
    group: podman
    # Quoted so the mode is passed as a string rather than parsed as a YAML octal integer.
    mode: "0644"
  with_dict: "{{ prometheus_targets }}"
  when: prometheus_targets != {}

# Ensure the Prometheus image (prometheus_container_image:prometheus_version)
# is present for the podman user; a change triggers a Prometheus restart.
- name: Pull image for Prometheus
containers.podman.podman_image:
name: "{{ prometheus_container_image }}:{{ prometheus_version }}"
become_user: podman
notify: Restart Prometheus

# Run any pending "Restart Prometheus" handler now, before the final state check.
- meta: flush_handlers # handles restarts due to config changes

# Enable the service and make sure it is running. It is restarted when the
# unit file was changed (podman_systemd_unit is registered by the podman
# role's systemd-unit.yml; treated as unchanged when that variable is not set),
# otherwise it is only started if not already running.
- name: Ensure Prometheus service state
systemd:
name: prometheus
enabled: true
state: "{{ 'restarted' if (podman_systemd_unit.changed | default(false)) else 'started' }}"
34 changes: 34 additions & 0 deletions ansible/roles/prometheus/templates/prometheus.yml.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#jinja2: trim_blocks: True, lstrip_blocks: True
{{ ansible_managed | comment }}
# http://prometheus.io/docs/operating/configuration/
{# Main Prometheus configuration, rendered by tasks/runtime.yml.
   Each section is a role variable serialised with to_nice_yaml; indent(N, False)
   leaves the first line alone, so the leading spaces before each expression
   supply that line's indentation. #}

global:
  {{ prometheus_global | to_nice_yaml(indent=2) | indent(2, False) }}
  external_labels:
    {{ prometheus_external_labels | to_nice_yaml(indent=2) | indent(4, False) }}

{% if prometheus_remote_write != [] %}
remote_write:
  {{ prometheus_remote_write | to_nice_yaml(indent=2) | indent(2, False) }}
{% endif %}

{% if prometheus_remote_read != [] %}
remote_read:
  {{ prometheus_remote_read | to_nice_yaml(indent=2) | indent(2, False) }}
{% endif %}

{# Path as seen inside the container: the unit file always mounts the host
   config directory at /etc/prometheus/, whatever prometheus_config_dir is. #}
rule_files:
  - /etc/prometheus/rules/*.rules

{# default([]) keeps rendering working when the alerting variables are not set. #}
{% if prometheus_alertmanager_config | default([]) | length > 0 %}
alerting:
  alertmanagers:
  {{ prometheus_alertmanager_config | to_nice_yaml(indent=2) | indent(2,False) }}
  {% if prometheus_alert_relabel_configs | default([]) | length > 0 %}
  alert_relabel_configs:
  {{ prometheus_alert_relabel_configs | to_nice_yaml(indent=2) | indent(2,False) }}
  {% endif %}
{% endif %}

scrape_configs:
  {{ prometheus_scrape_configs | to_nice_yaml(indent=2) | indent(2,False) }}
2 changes: 1 addition & 1 deletion environments/.stackhpc/terraform/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ variable "cluster_name" {
variable "cluster_image" {
description = "single image for all cluster nodes - a convenience for CI"
type = string
default = "openhpc-240102-1025-e533fd70" # https://github.com/stackhpc/ansible-slurm-appliance/pull/346
default = "openhpc-240105-1605-55370e49" # https://github.com/stackhpc/ansible-slurm-appliance/pull/308
# default = "Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2"
}

Expand Down
10 changes: 0 additions & 10 deletions environments/common/inventory/group_vars/all/defaults.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,16 +50,6 @@ appliances_local_users_default:
uid: 202
system: true

- group:
name: prometheus
gid: 976
user:
name: prometheus
uid: 981
home: "{{ prometheus_db_dir }}"
shell: /usr/sbin/nologin
enable: "{{ 'prometheus' in group_names }}"

- group:
name: grafana
gid: 979
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
# See: https://github.com/cloudalchemy/ansible-prometheus
# for variable definitions

prometheus_version: 2.27.0 # default from ansible/roles/cloudalchemy.prometheus/defaults/main.yml
prometheus_web_external_url: "http://{{ prometheus_address }}:9090"
prometheus_storage_retention: "31d"
prometheus_storage_retention_size: "100GB"
Expand Down
Loading
Loading