
Commit 5b01b9a

Author: Stefano Torresi
Merge pull request #123 from stefanotorresi/refactor-metrics

Refactor metrics

2 parents: 320186b + 79541e7 (commit 5b01b9a)

File tree

11 files changed: +113 additions, -173 deletions

README.md

Lines changed: 1 addition & 1 deletion

@@ -48,7 +48,7 @@ go get github.com/ClusterLabs/ha_cluster_exporter
 ```
 
 ### RPM
-You can find the repositories for RPM based distributions in [SUSE's Open Build Service](https://build.opensuse.org/repositories/server:monitoring/prometheus-ha_cluster_exporter).
+You can find the repositories for RPM based distributions in [SUSE's Open Build Service](https://build.opensuse.org/package/show/server:monitoring/prometheus-ha_cluster_exporter).
 On openSUSE or SUSE Linux Enterprise you can just use the `zypper` system package manager:
 ```shell
 export DISTRO=SLE_15_SP1 # change as desired

corosync_metrics.go

Lines changed: 9 additions & 14 deletions

@@ -12,29 +12,24 @@ import (
 	log "github.com/sirupsen/logrus"
 )
 
-var (
-	corosyncMetrics = metricDescriptors{
-		// the map key will function as an identifier of the metric throughout the rest of the code;
-		// it is arbitrary, but by convention we use the actual metric name
-		"quorate": NewMetricDesc("corosync", "quorate", "Whether or not the cluster is quorate", nil),
-		"ring_errors_total": NewMetricDesc("corosync", "ring_errors_total", "Total number of corosync ring errors", nil),
-		"quorum_votes": NewMetricDesc("corosync", "quorum_votes", "Cluster quorum votes; one line per type", []string{"type"}),
-	}
-)
-
 func NewCorosyncCollector(cfgToolPath string, quorumToolPath string) (*corosyncCollector, error) {
 	err := CheckExecutables(cfgToolPath, quorumToolPath)
 	if err != nil {
 		return nil, errors.Wrap(err, "could not initialize Corosync collector")
 	}
 
-	return &corosyncCollector{
+	collector := &corosyncCollector{
 		DefaultCollector{
-			metrics: corosyncMetrics,
+			subsystem: "corosync",
 		},
 		cfgToolPath,
 		quorumToolPath,
-	}, nil
+	}
+	collector.setDescriptor("quorate", "Whether or not the cluster is quorate", nil)
+	collector.setDescriptor("ring_errors", "The number of corosync ring errors", nil)
+	collector.setDescriptor("quorum_votes", "Cluster quorum votes; one line per type", []string{"type"})
+
+	return collector, nil
 }
 
 type corosyncCollector struct {

@@ -72,7 +67,7 @@ func (c *corosyncCollector) collectRingErrorsTotal(ch chan<- prometheus.Metric)
 		return errors.Wrap(err, "cannot parse ring status")
 	}
 
-	ch <- c.makeGaugeMetric("ring_errors_total", float64(ringErrorsTotal))
+	ch <- c.makeGaugeMetric("ring_errors", float64(ringErrorsTotal))
 
 	return nil
 }

doc/metrics.md

Lines changed: 15 additions & 40 deletions

@@ -25,14 +25,12 @@ The Pacemaker subsystem collects an atomic snapshot of the HA cluster directly f
 
 0. [Sample](../test/pacemaker.metrics)
 1. [`ha_cluster_pacemaker_config_last_change`](#ha_cluster_pacemaker_config_last_change)
-3. [`ha_cluster_pacemaker_fail_count`](#ha_cluster_pacemaker_fail_count)
-2. [`ha_cluster_pacemaker_location_constraints`](#ha_cluster_pacemaker_location_constraints)
+2. [`ha_cluster_pacemaker_fail_count`](#ha_cluster_pacemaker_fail_count)
+3. [`ha_cluster_pacemaker_location_constraints`](#ha_cluster_pacemaker_location_constraints)
 4. [`ha_cluster_pacemaker_migration_threshold`](#ha_cluster_pacemaker_migration_threshold)
-5. [`ha_cluster_pacemaker_nodes_total`](#ha_cluster_pacemaker_nodes_total)
-6. [`ha_cluster_pacemaker_nodes`](#ha_cluster_pacemaker_nodes)
-7. [`ha_cluster_pacemaker_resources_total`](#ha_cluster_pacemaker_resources_total)
-8. [`ha_cluster_pacemaker_resources`](#ha_cluster_pacemaker_resources)
-9. [`ha_cluster_pacemaker_stonith_enabled`](#ha_cluster_pacemaker_stonith_enabled)
+5. [`ha_cluster_pacemaker_nodes`](#ha_cluster_pacemaker_nodes)
+6. [`ha_cluster_pacemaker_resources`](#ha_cluster_pacemaker_resources)
+7. [`ha_cluster_pacemaker_stonith_enabled`](#ha_cluster_pacemaker_stonith_enabled)
 
 
 ### `ha_cluster_pacemaker_config_last_change`

@@ -92,13 +90,6 @@ Either the value is `1`, or the line is absent altogether.
 The total number of lines for this metric will be the cardinality of `name` times the cardinality of `status`.
 
 
-### `ha_cluster_pacemaker_nodes_total`
-
-#### Description
-
-The total number of *configured* nodes in the cluster. This value is mostly static and *does not* take into account the status of the nodes. It only changes when the Pacemaker configuration changes.
-
-
 ### `ha_cluster_pacemaker_resources`
 
 #### Description

@@ -117,13 +108,6 @@ Either the value is `1`, or the line is absent altogether.
 The total number of lines for this metric will be the cardinality of `id` times the cardinality of `status`.
 
 
-### `ha_cluster_pacemaker_resources_total`
-
-#### Description
-
-The total number of *configured* resources in the cluster. This value is mostly static and *does not* take into account the status of the resources. It only changes when the Pacemaker configuration changes.
-
-
 ### `ha_cluster_pacemaker_stonith_enabled`
 
 #### Description

@@ -139,7 +123,7 @@ The Corosync subsystem collects cluster quorum votes and ring status by parsing
 0. [Sample](../test/corosync.metrics)
 1. [`ha_cluster_corosync_quorate`](#ha_cluster_corosync_quorate)
 2. [`ha_cluster_corosync_quorum_votes`](#ha_cluster_corosync_quorum_votes)
-3. [`ha_cluster_corosync_ring_errors_total`](#ha_cluster_corosync_ring_errors_total)
+3. [`ha_cluster_corosync_ring_errors`](#ha_cluster_corosync_ring_errors)
 
 
 ### `ha_cluster_corosync_quorate`

@@ -161,44 +145,35 @@ Cluster quorum votes; one line per type.
 - `type`: one of `expected_votes|highest_expected|total_votes|quorum`
 
 
-### `ha_cluster_corosync_ring_errors_total`
+### `ha_cluster_corosync_ring_errors`
 
 #### Description
 
-Total number of corosync ring errors.
+The number of corosync ring errors.
 
 
 ## SBD
 
-The SBD subsystems collect devices stats by parsing its configuration the output of `sbd --dump`.
+The SBD subsystems collect devices stats by parsing its configuration and the output of `sbd --dump`.
 
 0. [Sample](../test/sbd.metrics)
-1. [`ha_cluster_sbd_device_status`](#ha_cluster_sbd_device_status)
-2. [`ha_cluster_sbd_devices_total`](#ha_cluster_sbd_devices_total)
+2. [`ha_cluster_sbd_devices`](#ha_cluster_sbd_devices)
 
-
-### `ha_cluster_sbd_device_status`
+### `ha_cluster_sbd_devices`
 
 #### Description
 
-Whether or not an SBD device is healthy. One line per `device`.
-Value is either `1` or `0`.
+The SBD devices in the cluster; one line per device.
+Either the value is `1`, or the line is absent altogether.
 
 #### Labels
 
-- `device`: the path of the device.
+- `device`: the path of the SBD device
+- `status`: one of `healthy|unhealthy`
 
 The total number of lines for this metric will be the cardinality of `device`.
 
 
-### `ha_cluster_sbd_devices_total`
-
-#### Description
-
-Total count of configured SBD devices.
-Value is an integer greater than or equal to `0`.
-
-
 ## DRBD
 
 The DRBD subsystems collect devices stats by parsing its configuration the JSON output of `drbdsetup`.
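The documentation change above replaces `ha_cluster_sbd_device_status` with `ha_cluster_sbd_devices`: one line per device, a `status` label of `healthy|unhealthy`, and a value of `1` or an absent line. The SBD collector itself is not included in this excerpt, so the following Go snippet is only a hedged sketch of how such a metric could be emitted with the refactored `setDescriptor`/`makeGaugeMetric` API; the `sbdDevice` type, the `devices` slice, and the health check are hypothetical.

```go
// Hypothetical sketch, not taken from this commit: emitting the documented
// ha_cluster_sbd_devices metric through the refactored DefaultCollector API.
type sbdDevice struct {
	path    string
	healthy bool
}

func collectSbdDevices(c *DefaultCollector, devices []sbdDevice, ch chan<- prometheus.Metric) {
	// assumes the collector was built with subsystem "sbd" and previously declared:
	// c.setDescriptor("devices", "The SBD devices in the cluster; one line per device", []string{"device", "status"})
	for _, d := range devices {
		status := "healthy"
		if !d.healthy {
			status = "unhealthy"
		}
		// the value is always 1; one line per device, status carried as a label
		ch <- c.makeGaugeMetric("devices", 1, d.path, status)
	}
}
```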

drbd_metrics.go

Lines changed: 21 additions & 25 deletions

@@ -43,41 +43,37 @@ type drbdStatus struct {
 	} `json:"connections"`
 }
 
-var (
-	drbdMetrics = metricDescriptors{
-		// the map key will function as an identifier of the metric throughout the rest of the code;
-		// it is arbitrary, but by convention we use the actual metric name
-		"resources": NewMetricDesc("drbd", "resources", "The DRBD resources; 1 line per name, per volume", []string{"resource", "role", "volume", "disk_state"}),
-		"written": NewMetricDesc("drbd", "written", "KiB written to DRBD; 1 line per res, per volume", []string{"resource", "volume"}),
-		"read": NewMetricDesc("drbd", "read", "KiB read from DRBD; 1 line per res, per volume", []string{"resource", "volume"}),
-		"al_writes": NewMetricDesc("drbd", "al_writes", "Writes to activity log; 1 line per res, per volume", []string{"resource", "volume"}),
-		"bm_writes": NewMetricDesc("drbd", "bm_writes", "Writes to bitmap; 1 line per res, per volume", []string{"resource", "volume"}),
-		"upper_pending": NewMetricDesc("drbd", "upper_pending", "Upper pending; 1 line per res, per volume", []string{"resource", "volume"}),
-		"lower_pending": NewMetricDesc("drbd", "lower_pending", "Lower pending; 1 line per res, per volume", []string{"resource", "volume"}),
-		"quorum": NewMetricDesc("drbd", "quorum", "Quorum status per resource and per volume", []string{"resource", "volume"}),
-		"connections": NewMetricDesc("drbd", "connections", "The DRBD resource connections; 1 line per per resource, per peer_node_id", []string{"resource", "peer_node_id", "peer_role", "volume", "peer_disk_state"}),
-		"connections_sync": NewMetricDesc("drbd", "connections_sync", "The in sync percentage value for DRBD resource connections", []string{"resource", "peer_node_id", "volume"}),
-		"connections_received": NewMetricDesc("drbd", "connections_received", "KiB received per connection", []string{"resource", "peer_node_id", "volume"}),
-		"connections_sent": NewMetricDesc("drbd", "connections_sent", "KiB sent per connection", []string{"resource", "peer_node_id", "volume"}),
-		"connections_pending": NewMetricDesc("drbd", "connections_pending", "Pending value per connection", []string{"resource", "peer_node_id", "volume"}),
-		"connections_unacked": NewMetricDesc("drbd", "connections_unacked", "Unacked value per connection", []string{"resource", "peer_node_id", "volume"}),
-		"split_brain": NewMetricDesc("drbd", "split_brain", "Whether a split brain has been detected; 1 line per resource, per volume.", []string{"resource", "volume"}),
-	}
-)
-
 func NewDrbdCollector(drbdSetupPath string, drbdSplitBrainPath string) (*drbdCollector, error) {
 	err := CheckExecutables(drbdSetupPath)
 	if err != nil {
 		return nil, errors.Wrap(err, "could not initialize DRBD collector")
 	}
 
-	return &drbdCollector{
+	collector := &drbdCollector{
 		DefaultCollector{
-			metrics: drbdMetrics,
+			subsystem: "drbd",
 		},
 		drbdSetupPath,
 		drbdSplitBrainPath,
-	}, nil
+	}
+
+	collector.setDescriptor("resources", "The DRBD resources; 1 line per name, per volume", []string{"resource", "role", "volume", "disk_state"})
+	collector.setDescriptor("written", "KiB written to DRBD; 1 line per res, per volume", []string{"resource", "volume"})
+	collector.setDescriptor("read", "KiB read from DRBD; 1 line per res, per volume", []string{"resource", "volume"})
+	collector.setDescriptor("al_writes", "Writes to activity log; 1 line per res, per volume", []string{"resource", "volume"})
+	collector.setDescriptor("bm_writes", "Writes to bitmap; 1 line per res, per volume", []string{"resource", "volume"})
+	collector.setDescriptor("upper_pending", "Upper pending; 1 line per res, per volume", []string{"resource", "volume"})
+	collector.setDescriptor("lower_pending", "Lower pending; 1 line per res, per volume", []string{"resource", "volume"})
+	collector.setDescriptor("quorum", "Quorum status per resource and per volume", []string{"resource", "volume"})
+	collector.setDescriptor("connections", "The DRBD resource connections; 1 line per per resource, per peer_node_id", []string{"resource", "peer_node_id", "peer_role", "volume", "peer_disk_state"})
+	collector.setDescriptor("connections_sync", "The in sync percentage value for DRBD resource connections", []string{"resource", "peer_node_id", "volume"})
+	collector.setDescriptor("connections_received", "KiB received per connection", []string{"resource", "peer_node_id", "volume"})
+	collector.setDescriptor("connections_sent", "KiB sent per connection", []string{"resource", "peer_node_id", "volume"})
+	collector.setDescriptor("connections_pending", "Pending value per connection", []string{"resource", "peer_node_id", "volume"})
+	collector.setDescriptor("connections_unacked", "Unacked value per connection", []string{"resource", "peer_node_id", "volume"})
+	collector.setDescriptor("split_brain", "Whether a split brain has been detected; 1 line per resource, per volume.", []string{"resource", "volume"})
+
+	return collector, nil
 }
 
 type drbdCollector struct {

ha_cluster_exporter.go

Lines changed: 31 additions & 25 deletions

@@ -26,49 +26,55 @@ func (SystemClock) Now() time.Time {
 	return time.Now()
 }
 
-type metricDescriptors map[string]*prometheus.Desc
-
 type DefaultCollector struct {
-	metrics metricDescriptors
+	subsystem   string
+	descriptors map[string]*prometheus.Desc
+}
+
+func (c *DefaultCollector) getDescriptor(name string) *prometheus.Desc {
+	desc, ok := c.descriptors[name]
+	if !ok {
+		// we hard panic on this because it's most certainly a coding error
+		panic(errors.Errorf("undeclared metric '%s'", name))
+	}
+	return desc
+}
+
+// Convenience wrapper around prometheus.NewDesc constructor.
+// Stores a metric descriptor with a fully qualified name like `NAMESPACE_subsystem_name`.
+// `name` is the last and most relevant part of the metrics Full Qualified Name;
+// `help` is the message displayed in the HELP line
+// `variableLabels` is a list of labels to declare. Use `nil` to declare no labels.
+func (c *DefaultCollector) setDescriptor(name, help string, variableLabels []string) {
+	if c.descriptors == nil {
+		c.descriptors = make(map[string]*prometheus.Desc)
+	}
+	c.descriptors[name] = prometheus.NewDesc(prometheus.BuildFQName(NAMESPACE, c.subsystem, name), help, variableLabels, nil)
 }
 
 func (c *DefaultCollector) Describe(ch chan<- *prometheus.Desc) {
-	for _, metric := range c.metrics {
-		ch <- metric
+	for _, descriptor := range c.descriptors {
+		ch <- descriptor
 	}
 }
 
-func (c *DefaultCollector) makeGaugeMetric(metricKey string, value float64, labelValues ...string) prometheus.Metric {
-	return c.makeMetric(metricKey, value, prometheus.GaugeValue, labelValues...)
+func (c *DefaultCollector) makeGaugeMetric(name string, value float64, labelValues ...string) prometheus.Metric {
+	return c.makeMetric(name, value, prometheus.GaugeValue, labelValues...)
 }
 
-func (c *DefaultCollector) makeCounterMetric(metricKey string, value float64, labelValues ...string) prometheus.Metric {
-	return c.makeMetric(metricKey, value, prometheus.CounterValue, labelValues...)
+func (c *DefaultCollector) makeCounterMetric(name string, value float64, labelValues ...string) prometheus.Metric {
+	return c.makeMetric(name, value, prometheus.CounterValue, labelValues...)
 }
 
-func (c *DefaultCollector) makeMetric(metricKey string, value float64, valueType prometheus.ValueType, labelValues ...string) prometheus.Metric {
-	desc, ok := c.metrics[metricKey]
-	if !ok {
-		// we hard panic on this because it's most certainly a coding error
-		panic(errors.Errorf("undeclared metric '%s'", metricKey))
-	}
+func (c *DefaultCollector) makeMetric(name string, value float64, valueType prometheus.ValueType, labelValues ...string) prometheus.Metric {
+	desc := c.getDescriptor(name)
 	metric := prometheus.MustNewConstMetric(desc, valueType, value, labelValues...)
 	if config.GetBool("enable-timestamps") {
 		metric = prometheus.NewMetricWithTimestamp(clock.Now(), metric)
 	}
 	return metric
 }
 
-// Convenience wrapper around Prometheus constructors.
-// Produces a metric with name `NAMESPACE_subsystem_name`.
-// `NAMESPACE` is a global project constant;
-// `subsystem` is an arbitrary name used to group related metrics under the same name prefix;
-// `name` is the last and most relevant part of the metrics Full Qualified Name;
-// `variableLabels` is a list of labels to declare. Use `nil` to declare no labels.
-func NewMetricDesc(subsystem, name, help string, variableLabels []string) *prometheus.Desc {
-	return prometheus.NewDesc(prometheus.BuildFQName(NAMESPACE, subsystem, name), help, variableLabels, nil)
-}
-
 // check that all the given paths exist and are executable files
 func CheckExecutables(paths ...string) error {
 	for _, path := range paths {
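The diff above replaces the package-level `NewMetricDesc` descriptor maps with per-collector `setDescriptor`/`getDescriptor` methods on `DefaultCollector`. For orientation, here is a minimal sketch (not part of the commit) of how a collector now declares and emits a metric under the new API; the `exampleCollector` type, its subsystem name, and its `Collect` body are hypothetical, while `setDescriptor`, `Describe`, and `makeGaugeMetric` are the methods shown in the diff.

```go
// Hypothetical illustration of the refactored API; not part of this commit.
// Assumes the project's package context (DefaultCollector, NAMESPACE).
type exampleCollector struct {
	DefaultCollector
}

func NewExampleCollector() *exampleCollector {
	collector := &exampleCollector{
		DefaultCollector{subsystem: "example"}, // assumed subsystem name
	}
	// registers a descriptor under the key "healthy";
	// the fully qualified name becomes NAMESPACE_example_healthy
	collector.setDescriptor("healthy", "Whether the example subsystem is healthy", nil)
	return collector
}

// Describe is inherited from DefaultCollector; Collect completes prometheus.Collector.
func (c *exampleCollector) Collect(ch chan<- prometheus.Metric) {
	// the key must match a declared descriptor, otherwise getDescriptor panics
	ch <- c.makeGaugeMetric("healthy", 1)
}
```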

ha_cluster_exporter_test.go

Lines changed: 5 additions & 11 deletions

@@ -37,15 +37,12 @@ func expectMetrics(t *testing.T, c prometheus.Collector, fixture string) {
 }
 
 func TestMetricFactory(t *testing.T) {
-	SUT := &DefaultCollector{
-		metrics: metricDescriptors{
-			"test_metric": NewMetricDesc("test", "metric", "", nil),
-		},
-	}
+	SUT := &DefaultCollector{}
+	SUT.setDescriptor("test_metric", "", nil)
 
 	metric := SUT.makeGaugeMetric("test_metric", 1)
 
-	assert.Equal(t, SUT.metrics["test_metric"], metric.Desc())
+	assert.Equal(t, SUT.getDescriptor("test_metric"), metric.Desc())
 }
 
 func TestMetricFactoryWithTimestamp(t *testing.T) {

@@ -56,11 +53,8 @@ func TestMetricFactoryWithTimestamp(t *testing.T) {
 	}()
 
 	clock = StoppedClock{}
-	SUT := &DefaultCollector{
-		metrics: metricDescriptors{
-			"test_metric": NewMetricDesc("test", "metric", "", nil),
-		},
-	}
+	SUT := &DefaultCollector{}
+	SUT.setDescriptor("test_metric", "", nil)
 
 	metric := SUT.makeGaugeMetric("test_metric", 1)
 	metricDto := &dto.Metric{}
