Skip to content

Commit

Permalink
fix(o11y): add templates and code for docker
Browse files Browse the repository at this point in the history
  • Loading branch information
raisedadead committed Feb 11, 2025
1 parent e3ade52 commit cd8c0a1
Show file tree
Hide file tree
Showing 5 changed files with 196 additions and 29 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,19 @@
- loki_push_endpoint != ""
- loki_username != ""
- loki_password != ""
fail_msg: "One or more required environment variables are not set. Please set all required variables."
fail_msg:
"One or more required environment variables are not set. Please set
all required variables."

- name: Install Alloy
ansible.builtin.include_role:
name: grafana.grafana.alloy
vars:
# --- DO NOT CHANGE ---
# Do not rename 'config' to 'alloy_config', even if the linter complains.
# The warning can be ignored because the config is generated from a template.
config: "{{ lookup('template', 'grafana-labs/config.alloy.j2') }}"
# --- DO NOT CHANGE ---

- name: Add the Alloy user to the adm group
ansible.builtin.user:
Expand Down
95 changes: 95 additions & 0 deletions ansible/play-o11y--grafana-cloud-1-docker.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
---
# Adds the Grafana Cloud Docker integration to a host that already has Alloy
# installed and enabled: appends the Docker metrics and/or logs snippets to
# /etc/alloy/config.alloy, then restarts Alloy only if something changed.
#
# Inputs:
#   variable_host               - host pattern to run against (default: "null")
#   variable_is_enabled_metrics - append the Docker metrics block (default: True)
#   variable_is_enabled_logs    - append the Docker logs block (default: False)
#   GRAFANA_CLOUD_* env vars    - looked up via the env lookup; all six must be
#                                 non-empty or the play aborts
- name: Grafana Cloud Setup
  hosts: '{{ variable_host | default("null") }}'
  become: true
  gather_facts: true

  vars:
    prometheus_push_endpoint: "{{ lookup('env', 'GRAFANA_CLOUD_URL_Prometheus') }}"
    prometheus_username: "{{ lookup('env', 'GRAFANA_CLOUD_USERNAME_Prometheus') }}"
    prometheus_password: "{{ lookup('env', 'GRAFANA_CLOUD_API_KEY_Prometheus') }}"
    loki_push_endpoint: "{{ lookup('env', 'GRAFANA_CLOUD_URL_Loki') }}"
    loki_username: "{{ lookup('env', 'GRAFANA_CLOUD_USERNAME_Loki') }}"
    loki_password: "{{ lookup('env', 'GRAFANA_CLOUD_API_KEY_Loki') }}"
    is_enabled_metrics: "{{ variable_is_enabled_metrics | default(True) }}"
    is_enabled_logs: "{{ variable_is_enabled_logs | default(False) }}"

  tasks:
    - name: Check if required environment variables are set
      ansible.builtin.assert:
        that:
          - prometheus_push_endpoint != ""
          - prometheus_username != ""
          - prometheus_password != ""
          - loki_push_endpoint != ""
          - loki_username != ""
          - loki_password != ""
        fail_msg:
          "One or more required environment variables are not set. Please set
          all required variables."

    # The three probes below are read-only. `check_mode: false` makes them run
    # under --check as well, so `.rc` is always registered for the assertion
    # that follows. `failed_when: false` records a non-zero rc without
    # reporting the expected "not found" case as a task error.
    - name: Check if Alloy is installed
      ansible.builtin.command: which alloy
      register: alloy_check
      failed_when: false
      changed_when: false
      check_mode: false

    - name: Check if Alloy service is enabled
      ansible.builtin.command: systemctl is-enabled alloy
      register: alloy_service_check
      failed_when: false
      changed_when: false
      check_mode: false

    - name: Check if Docker is installed
      ansible.builtin.command: which docker
      register: docker_check
      failed_when: false
      changed_when: false
      check_mode: false

    # One condition per prerequisite, so the failure output names the check
    # that did not pass.
    - name: Exit if checks fail
      ansible.builtin.assert:
        that:
          - docker_check.rc == 0
          - alloy_check.rc == 0
          - alloy_service_check.rc == 0
        fail_msg: "Checks failed. Please check the logs for more information."

    # No `when` guard is needed on this block: the assertion above already
    # stops the play for any host that fails a prerequisite.
    - name: Setup Docker integration
      block:
        - name: Append Docker config to Alloy config file
          when: is_enabled_metrics | bool
          ansible.builtin.blockinfile:
            path: /etc/alloy/config.alloy
            prepend_newline: true
            marker: "// {mark} ANSIBLE MANAGED BLOCK -- Docker Integration - Metrics"
            block: |
              {{ lookup('template', 'grafana-labs/docker.metrics.config.alloy.j2') }}
          notify: Restart the Alloy service

        - name: Append Docker logs config to Alloy config file
          when: is_enabled_logs | bool
          ansible.builtin.blockinfile:
            path: /etc/alloy/config.alloy
            prepend_newline: true
            marker: "// {mark} ANSIBLE MANAGED BLOCK -- Docker Integration - Logs"
            block: |
              {{ lookup('template', 'grafana-labs/docker.logs.config.alloy.j2') }}
          notify: Restart the Alloy service

        - name: Add alloy user to docker group
          ansible.builtin.user:
            name: alloy
            groups: docker
            append: true
          notify: Restart the Alloy service

        # NOTE(review): this runs Alloy as root, presumably so the cAdvisor
        # exporter and the Docker socket are accessible; confirm it is still
        # required, since it widens the service's privileges.
        #
        # The regexp matches any existing User= line (including User=root
        # after the first run), which keeps the task idempotent. If the unit
        # has no User= line, the new line is inserted right after [Service]
        # rather than appended at the end of the file.
        - name: Change the user to root in Alloy service user
          ansible.builtin.lineinfile:
            path: /etc/systemd/system/alloy.service
            regexp: "^User="
            insertafter: '^\[Service\]'
            line: "User=root"
            state: present
          notify: Restart the Alloy service

  handlers:
    # Runs once at the end of the play, and only when a task above reported a
    # change. daemon_reload picks up unit-file edits before the restart.
    - name: Restart the Alloy service
      ansible.builtin.systemd:
        name: alloy
        state: restarted
        daemon_reload: true
56 changes: 28 additions & 28 deletions ansible/templates/grafana-labs/config.alloy.j2
Original file line number Diff line number Diff line change
@@ -1,3 +1,25 @@
// Remote-write sink for metrics. The URL and basic-auth credentials are
// Jinja placeholders filled in when this template is rendered.
prometheus.remote_write "metrics_service" {
endpoint {
url = "{{ prometheus_push_endpoint }}"

basic_auth {
username = "{{ prometheus_username }}"
password = "{{ prometheus_password }}"
}
}
}

// Loki sink for logs. The URL and basic-auth credentials are Jinja
// placeholders filled in when this template is rendered.
loki.write "grafana_cloud_loki" {
endpoint {
url = "{{ loki_push_endpoint }}"

basic_auth {
username = "{{ loki_username }}"
password = "{{ loki_password }}"
}
}
}

prometheus.exporter.self "integrations_alloy_health" { }

discovery.relabel "integrations_alloy_health" {
Expand All @@ -21,13 +43,13 @@ prometheus.scrape "integrations_alloy_health" {
}

prometheus.relabel "integrations_alloy_health" {
forward_to = [prometheus.remote_write.metrics_service.receiver]
forward_to = [prometheus.remote_write.metrics_service.receiver]

rule {
source_labels = ["__name__"]
regex = "alloy_build_info|alloy_component_controller_evaluating|alloy_component_controller_running_components|alloy_component_dependencies_wait_seconds|alloy_component_dependencies_wait_seconds_bucket|alloy_component_evaluation_seconds|alloy_component_evaluation_seconds_bucket|alloy_component_evaluation_seconds_count|alloy_component_evaluation_seconds_sum|alloy_component_evaluation_slow_seconds|alloy_config_hash|alloy_resources_machine_rx_bytes_total|alloy_resources_machine_tx_bytes_total|alloy_resources_process_cpu_seconds_total|alloy_resources_process_resident_memory_bytes|cluster_node_gossip_health_score|cluster_node_gossip_proto_version|cluster_node_gossip_received_events_total|cluster_node_info|cluster_node_lamport_time|cluster_node_peers|cluster_node_update_observers|cluster_transport_rx_bytes_total|cluster_transport_rx_packet_queue_length|cluster_transport_rx_packets_failed_total|cluster_transport_rx_packets_total|cluster_transport_stream_rx_bytes_total|cluster_transport_stream_rx_packets_failed_total|cluster_transport_stream_rx_packets_total|cluster_transport_stream_tx_bytes_total|cluster_transport_stream_tx_packets_failed_total|cluster_transport_stream_tx_packets_total|cluster_transport_streams|cluster_transport_tx_bytes_total|cluster_transport_tx_packet_queue_length|cluster_transport_tx_packets_failed_total|cluster_transport_tx_packets_total|exporter_send_failed_spans_ratio_total|exporter_sent_spans_ratio_total|go_gc_duration_seconds_count|go_goroutines|go_memstats_heap_inuse_bytes|processor_batch_batch_send_size_ratio_bucket|processor_batch_metadata_cardinality_ratio|processor_batch_timeout_trigger_send_ratio_total|prometheus_remote_storage_bytes_total|prometheus_remote_storage_highest_timestamp_in_seconds|prometheus_remote_storage_metadata_bytes_total|prometheus_remote_storage_queue_highest_sent_timestamp_seconds|prometheus_remote_storage_samples_failed_total|prometheus_remote_storage_samples_retried_total|prometheus_remote_storage_samples_total|promethe
us_remote_storage_sent_batch_duration_seconds_bucket|prometheus_remote_storage_sent_batch_duration_seconds_count|prometheus_remote_storage_sent_batch_duration_seconds_sum|prometheus_remote_storage_shards|prometheus_remote_storage_shards_max|prometheus_remote_storage_shards_min|prometheus_remote_write_wal_samples_appended_total|prometheus_remote_write_wal_storage_active_series|receiver_accepted_spans_ratio_total|receiver_refused_spans_ratio_total|rpc_server_duration_milliseconds_bucket|scrape_duration_seconds|up"
action = "keep"
}
rule {
source_labels = ["__name__"]
regex = "alloy_build_info|alloy_component_controller_evaluating|alloy_component_controller_running_components|alloy_component_dependencies_wait_seconds|alloy_component_dependencies_wait_seconds_bucket|alloy_component_evaluation_seconds|alloy_component_evaluation_seconds_bucket|alloy_component_evaluation_seconds_count|alloy_component_evaluation_seconds_sum|alloy_component_evaluation_slow_seconds|alloy_config_hash|alloy_resources_machine_rx_bytes_total|alloy_resources_machine_tx_bytes_total|alloy_resources_process_cpu_seconds_total|alloy_resources_process_resident_memory_bytes|cluster_node_gossip_health_score|cluster_node_gossip_proto_version|cluster_node_gossip_received_events_total|cluster_node_info|cluster_node_lamport_time|cluster_node_peers|cluster_node_update_observers|cluster_transport_rx_bytes_total|cluster_transport_rx_packet_queue_length|cluster_transport_rx_packets_failed_total|cluster_transport_rx_packets_total|cluster_transport_stream_rx_bytes_total|cluster_transport_stream_rx_packets_failed_total|cluster_transport_stream_rx_packets_total|cluster_transport_stream_tx_bytes_total|cluster_transport_stream_tx_packets_failed_total|cluster_transport_stream_tx_packets_total|cluster_transport_streams|cluster_transport_tx_bytes_total|cluster_transport_tx_packet_queue_length|cluster_transport_tx_packets_failed_total|cluster_transport_tx_packets_total|exporter_send_failed_spans_ratio_total|exporter_sent_spans_ratio_total|go_gc_duration_seconds_count|go_goroutines|go_memstats_heap_inuse_bytes|processor_batch_batch_send_size_ratio_bucket|processor_batch_metadata_cardinality_ratio|processor_batch_timeout_trigger_send_ratio_total|prometheus_remote_storage_bytes_total|prometheus_remote_storage_highest_timestamp_in_seconds|prometheus_remote_storage_metadata_bytes_total|prometheus_remote_storage_queue_highest_sent_timestamp_seconds|prometheus_remote_storage_samples_failed_total|prometheus_remote_storage_samples_retried_total|prometheus_remote_storage_samples_total|promethe
us_remote_storage_sent_batch_duration_seconds_bucket|prometheus_remote_storage_sent_batch_duration_seconds_count|prometheus_remote_storage_sent_batch_duration_seconds_sum|prometheus_remote_storage_shards|prometheus_remote_storage_shards_max|prometheus_remote_storage_shards_min|prometheus_remote_write_wal_samples_appended_total|prometheus_remote_write_wal_storage_active_series|receiver_accepted_spans_ratio_total|receiver_refused_spans_ratio_total|rpc_server_duration_milliseconds_bucket|scrape_duration_seconds|up"
action = "keep"
}
}

logging {
Expand Down Expand Up @@ -63,28 +85,6 @@ loki.relabel "logs_integrations_integrations_alloy_health" {
}
}

prometheus.remote_write "metrics_service" {
endpoint {
url = "{{ prometheus_push_endpoint }}"

basic_auth {
username = "{{ prometheus_username }}"
password = "{{ prometheus_password }}"
}
}
}

loki.write "grafana_cloud_loki" {
endpoint {
url = "{{ loki_push_endpoint }}"

basic_auth {
username = "{{ loki_username }}"
password = "{{ loki_password }}"
}
}
}

discovery.relabel "integrations_node_exporter" {
targets = prometheus.exporter.unix.integrations_node_exporter.targets

Expand Down
35 changes: 35 additions & 0 deletions ansible/templates/grafana-labs/docker.logs.config.alloy.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
// Docker logs integration: discovers containers, labels their log streams,
// and forwards them to the loki.write "grafana_cloud_loki" component.

// Lists containers through the local Docker socket, refreshing every 5s.
discovery.docker "logs_integrations_docker" {
  refresh_interval = "5s"
  host             = "unix:///var/run/docker.sock"
}

// Holds relabel rules only: targets is empty, and the exported rules are
// consumed by loki.source.docker below. Rule order is significant.
discovery.relabel "logs_integrations_docker" {
  targets = []

  // Static job label.
  rule {
    replacement  = "integrations/docker"
    target_label = "job"
  }

  // Instance label taken from constants.hostname.
  rule {
    replacement  = constants.hostname
    target_label = "instance"
  }

  // Container label: the part of the Docker container name after the
  // leading "/".
  rule {
    target_label  = "container"
    source_labels = ["__meta_docker_container_name"]
    regex         = "/(.*)"
  }

  // Stream label copied from the container log stream metadata.
  rule {
    target_label  = "stream"
    source_labels = ["__meta_docker_container_log_stream"]
  }
}

// Reads logs for the discovered containers, applies the rules above, and
// forwards the entries to Loki.
loki.source.docker "logs_integrations_docker" {
  host             = "unix:///var/run/docker.sock"
  refresh_interval = "5s"
  targets          = discovery.docker.logs_integrations_docker.targets
  relabel_rules    = discovery.relabel.logs_integrations_docker.rules
  forward_to       = [loki.write.grafana_cloud_loki.receiver]
}
31 changes: 31 additions & 0 deletions ansible/templates/grafana-labs/docker.metrics.config.alloy.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
// Docker metrics integration. Pipeline:
//   cadvisor exporter -> discovery.relabel -> prometheus.scrape
//   -> prometheus.relabel -> prometheus.remote_write "metrics_service"

// cAdvisor exporter with docker_only enabled.
prometheus.exporter.cadvisor "integrations_cadvisor" {
  docker_only = true
}

// Attaches static job and instance labels to the exporter's targets.
discovery.relabel "integrations_cadvisor" {
  targets = prometheus.exporter.cadvisor.integrations_cadvisor.targets

  rule {
    replacement  = "integrations/docker"
    target_label = "job"
  }

  rule {
    replacement  = constants.hostname
    target_label = "instance"
  }
}

// Scrapes the relabelled targets and passes the samples to the metric
// filter below.
prometheus.scrape "integrations_cadvisor" {
  forward_to = [prometheus.relabel.integrations_cadvisor.receiver]
  targets    = discovery.relabel.integrations_cadvisor.output
}

// Keeps only the metric names listed in the regex, then forwards them to
// the remote-write component.
prometheus.relabel "integrations_cadvisor" {
  forward_to = [prometheus.remote_write.metrics_service.receiver]

  rule {
    action        = "keep"
    source_labels = ["__name__"]
    regex         = "up|container_cpu_usage_seconds_total|container_fs_inodes_free|container_fs_inodes_total|container_fs_limit_bytes|container_fs_usage_bytes|container_last_seen|container_memory_usage_bytes|container_network_receive_bytes_total|container_network_tcp_usage_total|container_network_transmit_bytes_total|container_spec_memory_reservation_limit_bytes|machine_memory_bytes|machine_scrape_error"
  }
}

0 comments on commit cd8c0a1

Please sign in to comment.