Skip to content

Commit ee986cc

Browse files
authored
Add split brain metric (#100)
* Implement metric structure * Add documentation in wip and some pseudo implementation * Implement splitbrain metric * update doc * Improve error handling inc case some postifix miss * Refactor and implement configurable path splitbrain * change default drbdsetup-path config value * update doc and metric with refacto * add fake collector * Implement test for metric change location to var/tmp/drbd
1 parent 0ee932a commit ee986cc

File tree

5 files changed

+105
-9
lines changed

5 files changed

+105
-9
lines changed

doc/metric_spec.md

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -206,8 +206,8 @@ The DRBD subsystems collect devices stats by parsing its configuration the JSON
206206
0. [Sample](../test/drbd.metrics)
207207
1. [`ha_cluster_drbd_resources`](#ha_cluster_drbd_resources)
208208
2. [`ha_cluster_drbd_connections`](#ha_cluster_drbd_connections)
209-
3. [`ha_cluster_drbd_connections_sync`](#ha_cluster_drbd_connections_sync`)
210-
209+
3. [`ha_cluster_drbd_connections_sync`](#ha_cluster_drbd_connections_sync)
210+
4. [`ha_cluster_drbd_split_brain`](#ha_cluster_drbd_split_brain)
211211

212212
### `ha_cluster_drbd_connections`
213213

@@ -249,3 +249,36 @@ Either the value is `1`, or the line is absent altogether.
249249
- `disk_state`: one of `attaching|failed|negotiating|inconsistent|outdated|dunknown|consistent|uptodate`
250250

251251
The total number of lines for this metric will be the cardinality of `name` times the cardinality of `volume`.
252+
253+
### `ha_cluster_drbd_split_brain`
254+
255+
#### Description
256+
257+
This metric signals whether a split brain is occurring, per resource and volume.
258+
Either the value is `1`, or the line is absent altogether.
259+
260+
This metric is special compared to the others: for it to work, you need to set up a custom DRBD split-brain handler. See the setup instructions at the end of this section.
261+
262+
#### Labels
263+
264+
- `resource`: the name of the resource.
265+
- `volume`: the volume number
266+
267+
#### Setting up the DRBD split-brain hook
268+
269+
In order to get the `split_brain` metric working:
270+
271+
1) Copy the hook onto all DRBD nodes:
272+
273+
get the hook from:
274+
https://github.com/SUSE/ha-sap-terraform-deployments/blob/72c9d3ecf6c3f6dd18ccb7bcbde4b40722d5c641/salt/drbd_node/files/notify-split-brain-haclusterexporter-suse-metric.sh
275+
276+
2) Enable the hook in the DRBD configuration:
277+
278+
`split_brain: "/usr/lib/drbd/notify-split-brain-haclusterexporter-suse-metric.sh"`
279+
280+
Refer to upstream doc: https://docs.linbit.com/docs/users-guide-8.4/#s-configure-split-brain-behavior
281+
282+
It is important for the exporter that the hook creates the files in that location and with that naming.
283+
284+
Remember to remove the files manually after the split brain is resolved.

drbd_metrics.go

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@ package main
22

33
import (
44
"encoding/json"
5+
"io/ioutil"
56
"os/exec"
7+
"path/filepath"
68
"strconv"
79
"strings"
810

@@ -37,10 +39,11 @@ var (
3739
"resources": NewMetricDesc("drbd", "resources", "The DRBD resources; 1 line per name, per volume", []string{"resource", "role", "volume", "disk_state"}),
3840
"connections": NewMetricDesc("drbd", "connections", "The DRBD resource connections; 1 line per per resource, per peer_node_id", []string{"resource", "peer_node_id", "peer_role", "volume", "peer_disk_state"}),
3941
"connections_sync": NewMetricDesc("drbd", "connections_sync", "The in sync percentage value for DRBD resource connections", []string{"resource", "peer_node_id", "volume"}),
42+
"split_brain": NewMetricDesc("drbd", "split_brain", "Whether a split brain has been detected; 1 line per resource, per volume.", []string{"resource", "volume"}),
4043
}
4144
)
4245

43-
func NewDrbdCollector(drbdSetupPath string) (*drbdCollector, error) {
46+
func NewDrbdCollector(drbdSetupPath string, drbdSplitBrainPath string) (*drbdCollector, error) {
4447
err := CheckExecutables(drbdSetupPath)
4548
if err != nil {
4649
return nil, errors.Wrap(err, "could not initialize DRBD collector")
@@ -51,12 +54,14 @@ func NewDrbdCollector(drbdSetupPath string) (*drbdCollector, error) {
5154
metrics: drbdMetrics,
5255
},
5356
drbdSetupPath,
57+
drbdSplitBrainPath,
5458
}, nil
5559
}
5660

5761
type drbdCollector struct {
5862
DefaultCollector
59-
drbdsetupPath string
63+
drbdsetupPath string
64+
drbdSplitBrainPath string
6065
}
6166

6267
func (c *drbdCollector) Collect(ch chan<- prometheus.Metric) {
@@ -65,6 +70,9 @@ func (c *drbdCollector) Collect(ch chan<- prometheus.Metric) {
6570

6671
log.Infoln("Collecting DRBD metrics...")
6772

73+
// set split brain metric
74+
c.setDrbdSplitBrainMetric(ch)
75+
6876
drbdStatusRaw, err := exec.Command(c.drbdsetupPath, "status", "--json").Output()
6977
if err != nil {
7078
log.Warnf("Error while retrieving drbd infos %s", err)
@@ -101,6 +109,7 @@ func (c *drbdCollector) Collect(ch chan<- prometheus.Metric) {
101109
}
102110
}
103111
}
112+
104113
}
105114

106115
func parseDrbdStatus(statusRaw []byte) ([]drbdStatus, error) {
@@ -111,3 +120,32 @@ func parseDrbdStatus(statusRaw []byte) ([]drbdStatus, error) {
111120
}
112121
return drbdDevs, nil
113122
}
123+
124+
func (c *drbdCollector) setDrbdSplitBrainMetric(ch chan<- prometheus.Metric) {
125+
126+
// set split brain metric
127+
// by default if the custom hook is not set, the exporter will not be able to detect it
128+
files, err := ioutil.ReadDir(c.drbdSplitBrainPath)
129+
if err != nil {
130+
log.Warnf("Error while reading directory %s: %s", c.drbdSplitBrainPath, err)
131+
}
132+
133+
for _, f := range files {
134+
// check if in directory there are file of syntax we expect (nil is when there is not any)
135+
match, _ := filepath.Glob(c.drbdSplitBrainPath + "/drbd-split-brain-detected-*")
136+
if match == nil {
137+
continue
138+
}
139+
resAndVolume := strings.Split(f.Name(), "drbd-split-brain-detected-")[1]
140+
141+
// avoid to have index out range panic error (in case the there is not resource-volume syntax)
142+
if len(strings.Split(resAndVolume, "-")) != 2 {
143+
continue
144+
}
145+
//Resource (0) volume (1) place in slice
146+
resourceAndVolume := strings.Split(resAndVolume, "-")
147+
148+
ch <- c.makeGaugeMetric("split_brain", float64(1), resourceAndVolume[0], resourceAndVolume[1])
149+
150+
}
151+
}

drbd_metrics_test.go

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package main
22

33
import (
4+
"os"
45
"testing"
56
)
67

@@ -152,14 +153,14 @@ func TestDrbdParsing(t *testing.T) {
152153
}
153154

154155
func TestNewDrbdCollector(t *testing.T) {
155-
_, err := NewDrbdCollector("test/fake_drbdsetup.sh")
156+
_, err := NewDrbdCollector("test/fake_drbdsetup.sh", "splitbrainpath")
156157
if err != nil {
157158
t.Errorf("Unexpected error, got: %v", err)
158159
}
159160
}
160161

161162
func TestNewDrbdCollectorChecksDrbdsetupExistence(t *testing.T) {
162-
_, err := NewDrbdCollector("test/nonexistent")
163+
_, err := NewDrbdCollector("test/nonexistent", "splitbrainfake")
163164
if err == nil {
164165
t.Fatal("a non nil error was expected")
165166
}
@@ -169,7 +170,7 @@ func TestNewDrbdCollectorChecksDrbdsetupExistence(t *testing.T) {
169170
}
170171

171172
func TestNewDrbdCollectorChecksDrbdsetupExecutableBits(t *testing.T) {
172-
_, err := NewDrbdCollector("test/dummy")
173+
_, err := NewDrbdCollector("test/dummy", "splibrainfake")
173174
if err == nil {
174175
t.Fatalf("a non nil error was expected")
175176
}
@@ -180,7 +181,26 @@ func TestNewDrbdCollectorChecksDrbdsetupExecutableBits(t *testing.T) {
180181

181182
func TestDRBDCollector(t *testing.T) {
182183
clock = StoppedClock{}
184+
splitBrainDir := "/var/tmp/drbd/splitbrain"
185+
testFiles := [3]string{
186+
"drbd-split-brain-detected-resource01-vol01",
187+
"drbd-split-brain-detected-resource02-vol02",
188+
"drbd-split-brain-detected-missingthingsWrongSkippedMetricS",
189+
}
190+
// create dir for putting temp file if not existings
191+
if _, err := os.Stat(splitBrainDir); os.IsNotExist(err) {
192+
err := os.MkdirAll(splitBrainDir, os.ModePerm)
193+
if err != nil {
194+
t.Errorf("Unexpected error: %v", err)
195+
}
196+
}
183197

184-
collector, _ := NewDrbdCollector("test/fake_drbdsetup.sh")
198+
for _, testFile := range testFiles {
199+
os.Create(splitBrainDir + "/" + testFile)
200+
}
201+
defer os.RemoveAll(splitBrainDir)
202+
203+
collector, _ := NewDrbdCollector("test/fake_drbdsetup.sh", splitBrainDir)
185204
expectMetrics(t, collector, "drbd.metrics")
205+
186206
}

ha_cluster_exporter.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,7 @@ func init() {
126126
flag.String("sbd-path", "/usr/sbin/sbd", "path to sbd executable")
127127
flag.String("sbd-config-path", "/etc/sysconfig/sbd", "path to sbd configuration")
128128
flag.String("drbdsetup-path", "/sbin/drbdsetup", "path to drbdsetup executable")
129+
flag.String("drbdsplitbrain-path", "/var/run/drbd/splitbrain", "path to drbd splitbrain hooks temporary files")
129130

130131
err := config.BindPFlags(flag.CommandLine)
131132
if err != nil {
@@ -181,7 +182,7 @@ func main() {
181182
log.Info("SBD collector registered")
182183
}
183184

184-
drbdCollector, err := NewDrbdCollector(config.GetString("drbdsetup-path"))
185+
drbdCollector, err := NewDrbdCollector(config.GetString("drbdsetup-path"), config.GetString("drbdsplitbrain-path"))
185186
if err != nil {
186187
log.Warn(err)
187188
} else {

test/drbd.metrics

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,7 @@ ha_cluster_drbd_connections_sync{peer_node_id="1",resource="1-single-1",volume="
1010
# TYPE ha_cluster_drbd_resources gauge
1111
ha_cluster_drbd_resources{disk_state="uptodate",resource="1-single-0",role="Secondary",volume="0"} 1 1234
1212
ha_cluster_drbd_resources{disk_state="uptodate",resource="1-single-1",role="Secondary",volume="0"} 1 1234
13+
# HELP ha_cluster_drbd_split_brain Whether a split brain has been detected; 1 line per resource, per volume.
14+
# TYPE ha_cluster_drbd_split_brain gauge
15+
ha_cluster_drbd_split_brain{resource="resource01",volume="vol01"} 1 1234
16+
ha_cluster_drbd_split_brain{resource="resource02",volume="vol02"} 1 1234

0 commit comments

Comments
 (0)