Skip to content

Commit 52785eb

Browse files
authored
Merge pull request #19 from ClusterLabs/sbd
add SBD device status metrics
2 parents f8f9870 + 245fb3d commit 52785eb

File tree

5 files changed

+325
-7
lines changed

5 files changed

+325
-7
lines changed

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
module github.com/MalloZup/ha_cluster_exporter
1+
module github.com/ClusterLabs/ha_cluster_exporter
22

33
go 1.12
44

ha_cluster_exporter.go

Lines changed: 44 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,6 @@ type resource struct {
6969

7070
var (
7171
// corosync metrics
72-
7372
corosyncRingErrorsTotal = prometheus.NewGauge(prometheus.GaugeOpts{
7473
Name: "corosync_ring_errors_total",
7574
Help: "Total number of ring errors in corosync",
@@ -93,6 +92,13 @@ var (
9392

9493
// metrics with labels. (prefer these always as guideline)
9594

95+
// sbd metrics
96+
sbdDevStatus = prometheus.NewGaugeVec(
97+
prometheus.GaugeOpts{
98+
Name: "cluster_sbd_device_status",
99+
Help: "cluster sbd status for each SBD device. 1 is healthy device, 0 is not",
100+
}, []string{"device_name"})
101+
96102
// corosync quorum
97103
corosyncQuorum = prometheus.NewGaugeVec(
98104
prometheus.GaugeOpts{
@@ -122,6 +128,8 @@ func init() {
122128
prometheus.MustRegister(corosyncRingErrorsTotal)
123129
prometheus.MustRegister(corosyncQuorum)
124130
prometheus.MustRegister(corosyncQuorate)
131+
prometheus.MustRegister(sbdDevStatus)
132+
125133
}
126134

127135
// this function is for some cluster metrics which have resource as labels.
@@ -170,7 +178,39 @@ func main() {
170178

171179
// each group of metrics is handled in its own goroutine; all of them use the same timeout.
172180

173-
// 1a) set corosync metrics: Ring errors
181+
// set SBD device metrics
182+
go func() {
183+
for {
184+
log.Println("[INFO]: Reading cluster SBD configuration..")
185+
// read configuration of SBD
186+
sbdConfiguration, err := readSdbFile()
187+
if err != nil {
188+
log.Panic("couldn't read SBD /etc/sysconfig/sbd config file")
189+
}
190+
// retrieve a list of sbd devices
191+
sbdDevices := getSbdDevices(sbdConfiguration)
192+
// set and return a map of sbd devices with true healthy, false not
193+
sbdStatus := setSbdDeviceHealth(sbdDevices)
194+
195+
if len(sbdStatus) == 0 {
196+
log.Println("[WARN]: Could not retrieve any sbd device")
197+
continue
198+
}
199+
200+
for sbdDev, sbdStatusBool := range sbdStatus {
201+
// true it means the sbd device is healthy
202+
if sbdStatusBool == true {
203+
sbdDevStatus.WithLabelValues(sbdDev).Set(float64(1))
204+
} else {
205+
sbdDevStatus.WithLabelValues(sbdDev).Set(float64(0))
206+
}
207+
}
208+
209+
time.Sleep(time.Duration(int64(*timeoutSeconds)) * time.Second)
210+
}
211+
}()
212+
213+
// set corosync metrics: Ring errors
174214
go func() {
175215
for {
176216
ringStatus := getCorosyncRingStatus()
@@ -184,7 +224,7 @@ func main() {
184224
time.Sleep(time.Duration(int64(*timeoutSeconds)) * time.Second)
185225
}
186226
}()
187-
// 1b) set corosync metrics: quorum metrics
227+
// set corosync metrics: quorum metrics
188228
go func() {
189229
for {
190230
quoromStatus := getQuoromClusterInfo()
@@ -209,7 +249,7 @@ func main() {
209249
time.Sleep(time.Duration(int64(*timeoutSeconds)) * time.Second)
210250
}
211251
}()
212-
// 2) set cluster pacemaker metrics
252+
// set cluster pacemaker metrics
213253
go func() {
214254
for {
215255

sbd_metrics.go

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
package main
2+
3+
import (
4+
"fmt"
5+
"io/ioutil"
6+
"os"
7+
"os/exec"
8+
"regexp"
9+
"strings"
10+
)
11+
12+
// readSdbFile returns the raw content of the SBD configuration file
// /etc/sysconfig/sbd. When the file cannot be opened or read, the returned
// slice is nil and the error describes which of the two steps failed.
func readSdbFile() ([]byte, error) {
	f, openErr := os.Open("/etc/sysconfig/sbd")
	if openErr != nil {
		return nil, fmt.Errorf("[ERROR] Could not open sbd config file %s", openErr)
	}
	defer f.Close()

	content, readErr := ioutil.ReadAll(f)
	if readErr == nil {
		return content, nil
	}
	return nil, fmt.Errorf("[ERROR] Could not read sbd config file %s", readErr)
}
26+
27+
// sbdDeviceLineRegexp matches the first active (not commented out) SBD_DEVICE
// assignment of an sbd sysconfig file and captures everything after the "="
// up to the end of that line. It is compiled once instead of on every call.
var sbdDeviceLineRegexp = regexp.MustCompile(`(?m)^[ \t]*SBD_DEVICE=(.*)$`)

// getSbdDevices returns the list of sbd devices found in the given raw sbd
// configuration.
//
// The value can be written both as SBD_DEVICE="/dev/foo" and as
// SBD_DEVICE=/dev/foo;/dev/bar: surrounding quotes are removed and the value
// is split on ";". Lines where SBD_DEVICE is commented out are ignored.
//
// When the configuration contains no SBD_DEVICE assignment, or the assigned
// value is empty, a nil slice is returned, so callers must be prepared to
// handle an empty result.
func getSbdDevices(sbdConfigRaw []byte) []string {
	match := sbdDeviceLineRegexp.FindSubmatch(sbdConfigRaw)
	if match == nil {
		// no SBD_DEVICE configured at all
		return nil
	}

	// drop surrounding whitespace (including a trailing "\r") and quotes
	value := strings.Trim(strings.TrimSpace(string(match[1])), `"'`)

	// make a list of devices by ";" separators, skipping empty entries such
	// as the ones produced by an empty value or a trailing ";"
	var sbdDevices []string
	for _, dev := range strings.Split(value, ";") {
		if dev = strings.TrimSpace(dev); dev != "" {
			sbdDevices = append(sbdDevices, dev)
		}
	}

	return sbdDevices
}
39+
40+
// setSbdDeviceHealth probes every device of sbdDevices by running
// "sbd -d <device> dump" and returns a map from device name to its status:
// true when the command succeeded (healthy), false when it returned an error.
func setSbdDeviceHealth(sbdDevices []string) map[string]bool {
	health := map[string]bool{}

	for i := 0; i < len(sbdDevices); i++ {
		device := sbdDevices[i]
		_, dumpErr := exec.Command("sbd", "-d", device, "dump").Output()
		// any error from the dump command marks the device as not healthy
		health[device] = dumpErr == nil
	}

	return health
}

sbd_metrics_test.go

Lines changed: 223 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,223 @@
1+
package main
2+
3+
import (
4+
"fmt"
5+
"testing"
6+
)
7+
8+
func TestReadSbdConfFileError(t *testing.T) {
9+
fmt.Println("=== Testing SBD : reading config file")
10+
sbdConfFile, err := readSdbFile()
11+
12+
if sbdConfFile != nil {
13+
t.Errorf("SbdConfig file should be empty in case of error")
14+
}
15+
16+
// we expect that in ci we fail to read the config file
17+
// since there shouldn't be any sbd config in ci
18+
if err == nil {
19+
t.Errorf("Error should be not nil got %s", err)
20+
}
21+
22+
}
23+
24+
func TestGetSbdDevicesWithoutDoubleQuotes(t *testing.T) {
25+
fmt.Println("=== Testing SBD devices retrival from config without quotes")
26+
27+
// this is a full config file more or less , in other tests it is cutted
28+
sbdConfig := `
29+
# SBD_DEVICE specifies the devices to use for exchanging sbd messages
30+
# and to monitor. If specifying more than one path, use ";" as
31+
# separator.
32+
#
33+
#SBD_DEVICE=""
34+
35+
## Type: yesno
36+
## Default: yes
37+
#
38+
# Whether to enable the pacemaker integration.
39+
#
40+
SBD_PACEMAKER=yes
41+
42+
## Type: list(always,clean)
43+
## Default: always
44+
#
45+
# Specify the start mode for sbd. Setting this to "clean" will only
46+
# allow sbd to start if it was not previously fenced. See the -S option
47+
# in the man page.
48+
#
49+
SBD_STARTMODE=always
50+
51+
## Type: yesno / integer
52+
## Default: no
53+
#
54+
# Whether to delay after starting sbd on boot for "msgwait" seconds.
55+
# This may be necessary if your cluster nodes reboot so fast that the
56+
# other nodes are still waiting in the fence acknowledgement phase.
57+
# This is an occasional issue with virtual machines.
58+
#
59+
# This can also be enabled by being set to a specific delay value, in
60+
# seconds. Sometimes a longer delay than the default, "msgwait", is
61+
# needed, for example in the cases where it's considered to be safer to
62+
# wait longer than:
63+
# corosync token timeout + consensus timeout + pcmk_delay_max + msgwait
64+
#
65+
# Be aware that the special value "1" means "yes" rather than "1s".
66+
#
67+
# Consider that you might have to adapt the startup-timeout accordingly
68+
# if the default isn't sufficient. (TimeoutStartSec for systemd)
69+
#
70+
# This option may be ignored at a later point, once pacemaker handles
71+
# this case better.
72+
#
73+
SBD_DELAY_START=no
74+
75+
## Type: string
76+
## Default: /dev/watchdog
77+
#
78+
# Watchdog device to use. If set to /dev/null, no watchdog device will
79+
# be used.
80+
#
81+
SBD_WATCHDOG_DEV=/dev/watchdog
82+
83+
## Type: integer
84+
## Default: 5
85+
#
86+
# How long, in seconds, the watchdog will wait before panicking the
87+
# node if no-one tickles it.
88+
#
89+
# This depends mostly on your storage latency; the majority of devices
90+
# must be successfully read within this time, or else the node will
91+
# self-fence.
92+
#
93+
# If your sbd device(s) reside on a multipath setup or iSCSI, this
94+
# should be the time required to detect a path failure.
95+
#
96+
# Be aware that watchdog timeout set in the on-disk metadata takes
97+
# precedence.
98+
#
99+
SBD_WATCHDOG_TIMEOUT=5
100+
101+
## Type: string
102+
## Default: "flush,reboot"
103+
#
104+
# Actions to be executed when the watchers don't timely report to the sbd
105+
# master process or one of the watchers detects that the master process
106+
# has died.
107+
#
108+
# Set timeout-action to comma-separated combination of
109+
# noflush|flush plus reboot|crashdump|off.
110+
# If just one of both is given the other stays at the default.
111+
#
112+
# This doesn't affect actions like off, crashdump, reboot explicitly
113+
# triggered via message slots.
114+
# And it does as well not configure the action a watchdog would
115+
# trigger should it run off (there is no generic interface).
116+
#
117+
SBD_TIMEOUT_ACTION=flush,reboot
118+
119+
## Type: string
120+
## Default: ""
121+
#
122+
# Additional options for starting sbd
123+
#
124+
SBD_OPTS=
125+
SBD_DEVICE=/dev/vdc;/dev/brother;/dev/syster
126+
`
127+
128+
sbdDevices := getSbdDevices([]byte(sbdConfig))
129+
// we should have 3 devices
130+
expected := "/dev/vdc"
131+
if sbdDevices[0] != expected {
132+
t.Errorf("sbdDevice was incorrect, got: %s, expected: %s ", sbdDevices[0], expected)
133+
}
134+
expected = "/dev/brother"
135+
if sbdDevices[1] != expected {
136+
t.Errorf("sbdDevice was incorrect, got: %s, expected: %s ", sbdDevices[0], expected)
137+
}
138+
139+
expected = "/dev/syster"
140+
if sbdDevices[2] != expected {
141+
t.Errorf("sbdDevice was incorrect, got: %s, expected: %s ", sbdDevices[0], expected)
142+
}
143+
144+
if len(sbdDevices) != 3 {
145+
t.Errorf("length of SbdDevice should be 3 got %d", len(sbdDevices))
146+
}
147+
148+
}
149+
150+
// test the other case with double quotes, and put the string in random place
151+
func TestGetSbdDevicesWithDoubleQuotes(t *testing.T) {
152+
fmt.Println("=== Testing SBD devices retrival from config with Double quotes")
153+
154+
sbdConfig := `## Type: string
155+
## Default: ""
156+
#
157+
# SBD_DEVICE specifies the devices to use for exchanging sbd messages
158+
# and to monitor. If specifying more than one path, use ";" as
159+
# separator.
160+
#
161+
#SBD_DEVICE=""
162+
163+
SBD_WATCHDOG_TIMEOUT=5
164+
165+
SBD_DEVICE="/dev/vdc;/dev/brother;/dev/syster"
166+
167+
SBD_TIMEOUT_ACTION=flush,reboot
168+
169+
## Type: string
170+
## Default: ""
171+
#
172+
# Additional options for starting sbd
173+
#
174+
SBD_OPTS=`
175+
176+
sbdDevices := getSbdDevices([]byte(sbdConfig))
177+
// we should have 3 devices
178+
expected := "/dev/vdc"
179+
if sbdDevices[0] != expected {
180+
t.Errorf("sbdDevice was incorrect, got: %s, expected: %s ", sbdDevices[0], expected)
181+
}
182+
expected = "/dev/brother"
183+
if sbdDevices[1] != expected {
184+
t.Errorf("sbdDevice was incorrect, got: %s, expected: %s ", sbdDevices[0], expected)
185+
}
186+
187+
expected = "/dev/syster"
188+
if sbdDevices[2] != expected {
189+
t.Errorf("sbdDevice was incorrect, got: %s, expected: %s ", sbdDevices[0], expected)
190+
}
191+
192+
if len(sbdDevices) != 3 {
193+
t.Errorf("length of SbdDevice should be 3 got %d", len(sbdDevices))
194+
}
195+
196+
}
197+
198+
// test the other case with double quotes, and put the string in random place
199+
func TestOnlyOneDeviceSbd(t *testing.T) {
200+
fmt.Println("=== Testing Only 1 device")
201+
202+
sbdConfig := `## Type: string
203+
## Default: ""
204+
205+
SBD_DEVICE=/dev/vdc
206+
207+
## Type: string
208+
## Default: "flush,reboot"
209+
`
210+
211+
sbdDevices := getSbdDevices([]byte(sbdConfig))
212+
213+
// we should have 1 device
214+
expected := "/dev/vdc"
215+
if sbdDevices[0] != expected {
216+
t.Errorf("sbdDevice was incorrect, got: %s, expected: %s ", sbdDevices[0], expected)
217+
}
218+
219+
if len(sbdDevices) != 1 {
220+
t.Errorf("length of SbdDevice should be 1 got %d", len(sbdDevices))
221+
}
222+
223+
}

tools/deploy-to-cluster.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,8 @@
44

55
# this script is just for deploying the binary to the cluster. Nothing else.
66

7-
node="[email protected].29.106"
7+
node="[email protected].31.221"
88

99
ssh $node "rm /root/ha_cluster_exporter"
1010
echo "copying binary"
1111
scp ha_cluster_exporter $node:
12-

0 commit comments

Comments
 (0)