
Commit 32905d7

use only labels for metrics
- add a seconds flag for the timeout of the sleep in the main-loop phases, so we can fetch data, then sleep X seconds
- implement a mechanism for resetting a metric that carries old values. The reset removes all previously exported values. It is used only for one particular metric; this way we avoid complex hash retrieval of resources. The reset approach is the best fit for deleting resources: the main motivation is that crm_mon gives us a snapshot of the data, so we just need to reset the metric before repopulating it (a short sketch of the pattern follows the file summary below). In theory one could reset all metrics at each iteration, but for the sake of simplicity only what is needed is reset for now. The other metrics use the Set operation, which already "resets" the value.
1 parent 03492e6 commit 32905d7

File tree

1 file changed: main.go

Lines changed: 39 additions & 88 deletions
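To make the reset mechanism concrete, here is a minimal, self-contained sketch of the pattern, assuming the prometheus client_golang package. The helper name resetClusterResources and the demo values in main are illustrative, not part of the commit.

package main

import "github.com/prometheus/client_golang/prometheus"

var clusterResources = prometheus.NewGaugeVec(
    prometheus.GaugeOpts{
        Name: "cluster_resources",
        Help: "number of cluster resources",
    }, []string{"node", "resource_name", "role"})

// resetClusterResources drops every label combination exported in the
// previous iteration by swapping the registered collector for a fresh,
// empty one.
func resetClusterResources() {
    prometheus.Unregister(clusterResources)
    clusterResources = prometheus.NewGaugeVec(
        prometheus.GaugeOpts{
            Name: "cluster_resources",
            Help: "number of cluster resources",
        }, []string{"node", "resource_name", "role"})
    prometheus.MustRegister(clusterResources)
}

func main() {
    prometheus.MustRegister(clusterResources)
    clusterResources.WithLabelValues("node1", "rsc_demo", "Started").Inc()
    resetClusterResources() // the stale node1/rsc_demo series is gone
}

client_golang vectors also expose a Reset() method that deletes all child metrics in place; the unregister/re-register route sketched here mirrors what the commit itself does inside the polling loop.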
@@ -109,6 +109,8 @@ type perNodeMetrics struct {
 // this historically from hawk-apiserver and parse some generic metrics
 // it clusterStaterieve and parse cluster data and counters
 func parseGenericMetrics(status *crmMon) *clusterMetrics {
+
+    // clusterState save all the xml data . This is the metrics we will convert later to gauge etc.
     clusterState := &clusterMetrics{}
 
     clusterState.Node.Configured = status.Summary.Nodes.Number
@@ -191,6 +193,7 @@ func parseGenericMetrics(status *crmMon) *clusterMetrics {
                 clusterState.Resource.FailureIgnored++
             }
         }
+
     }
 
     clusterState.Resource.Unique = len(rscIds)
@@ -199,58 +202,7 @@ func parseGenericMetrics(status *crmMon) *clusterMetrics {
 }
 
 var (
-    // simple gauge metric
-    clusterNodesConf = prometheus.NewGauge(prometheus.GaugeOpts{
-        Name: "cluster_nodes_configured",
-        Help: "Number of nodes configured in ha cluster",
-    })
-
-    clusterNodesOnline = prometheus.NewGauge(prometheus.GaugeOpts{
-        Name: "cluster_nodes_online",
-        Help: "Number of nodes online in ha cluster",
-    })
-
-    clusterNodesStandby = prometheus.NewGauge(prometheus.GaugeOpts{
-        Name: "cluster_nodes_standby",
-        Help: "Number of nodes standby in ha cluster",
-    })
-
-    clusterNodesStandbyOnFail = prometheus.NewGauge(prometheus.GaugeOpts{
-        Name: "cluster_nodes_stanby_onfail",
-        Help: "Number of nodes standby onfail in ha cluster",
-    })
-
-    clusterNodesMaintenance = prometheus.NewGauge(prometheus.GaugeOpts{
-        Name: "cluster_nodes_maintenance",
-        Help: "Number of nodes in maintainance in ha cluster",
-    })
-
-    clusterNodesPending = prometheus.NewGauge(prometheus.GaugeOpts{
-        Name: "cluster_nodes_pending",
-        Help: "Number of nodes pending in ha cluster",
-    })
-
-    clusterNodesUnclean = prometheus.NewGauge(prometheus.GaugeOpts{
-        Name: "cluster_nodes_unclean",
-        Help: "Number of nodes unclean in ha cluster",
-    })
-
-    clusterNodesShutdown = prometheus.NewGauge(prometheus.GaugeOpts{
-        Name: "cluster_nodes_shutdown",
-        Help: "Number of nodes shutdown in ha cluster",
-    })
-
-    clusterNodesExpectedUp = prometheus.NewGauge(prometheus.GaugeOpts{
-        Name: "cluster_nodes_expected_up",
-        Help: "Number of nodes expected up in ha cluster",
-    })
-
-    clusterNodesDC = prometheus.NewGauge(prometheus.GaugeOpts{
-        Name: "cluster_nodes_expected_dc",
-        Help: "Number of nodes dc in ha cluster",
-    })
-
-    // a gauge metric with label
+    // metrics with labels. (prefer these always as guideline)
     clusterNodes = prometheus.NewGaugeVec(
         prometheus.GaugeOpts{
             Name: "cluster_nodes",
@@ -262,7 +214,7 @@ var (
             Name: "cluster_resources_running",
             Help: "number of cluster resources running",
         }, []string{"node"})
-
+    // TODO: rename this to nodeResource
     clusterResources = prometheus.NewGaugeVec(
         prometheus.GaugeOpts{
             Name: "cluster_resources",
@@ -277,27 +229,16 @@ var (
 )
 
 func initMetrics() {
-    // Metrics have to be registered to be exposed:
-    prometheus.MustRegister(clusterNodesConf)
-    prometheus.MustRegister(clusterNodesOnline)
-    prometheus.MustRegister(clusterNodesStandby)
-    prometheus.MustRegister(clusterNodesStandbyOnFail)
-    prometheus.MustRegister(clusterNodesMaintenance)
-    prometheus.MustRegister(clusterNodesPending)
-    prometheus.MustRegister(clusterNodesUnclean)
-    prometheus.MustRegister(clusterNodesShutdown)
-    prometheus.MustRegister(clusterNodesExpectedUp)
-    prometheus.MustRegister(clusterNodesDC)
-
-    // metrics with labels
+
     prometheus.MustRegister(clusterNodes)
+    // resources TODO: this 3 metrics can be refactored
     prometheus.MustRegister(clusterResourcesRunning)
    prometheus.MustRegister(clusterResources)
     prometheus.MustRegister(clusterResourcesStatus)
-
 }
 
 var portNumber = flag.String("port", ":9001", "The port number to listen on for HTTP requests.")
+var timeoutSeconds = flag.Int("timeout", 5, "timeout seconds for exporter to wait to fetch new data")
 
 func main() {
     // read cli option and setup initial stat
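The new -timeout flag introduced in this hunk feeds the sleep at the bottom of the polling loop. A standalone sketch of the same flag-to-Duration plumbing (the binary name in the comment is illustrative):

package main

import (
    "flag"
    "fmt"
    "time"
)

var timeoutSeconds = flag.Int("timeout", 5, "timeout seconds for exporter to wait to fetch new data")

func main() {
    flag.Parse() // e.g. ./exporter -timeout 10
    fmt.Println("[INFO]: sleeping for", *timeoutSeconds, "seconds")
    // flag.Int returns *int; time.Sleep wants a time.Duration, hence the
    // conversion before multiplying by time.Second.
    time.Sleep(time.Duration(*timeoutSeconds) * time.Second)
}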
@@ -311,7 +252,17 @@ func main() {
 
         for {
 
-            var status crmMon
+            // We want to reset certains metrics to 0 each time for removing the state.
+            // since we have complex/nested metrics with multiples labels, unregistering/re-registering is the cleanest way.
+            prometheus.Unregister(clusterResources)
+            // overwrite metric with an empty one
+            clusterResources := prometheus.NewGaugeVec(
+                prometheus.GaugeOpts{
+                    Name: "cluster_resources",
+                    Help: "number of cluster resources",
+                }, []string{"node", "resource_name", "role"})
+            prometheus.MustRegister(clusterResources)
+
             // get cluster status xml
             fmt.Println("[INFO]: Reading cluster configuration with crm_mon..")
             monxml, err := exec.Command("/usr/sbin/crm_mon", "-1", "--as-xml", "--group-by-node", "--inactive").Output()
@@ -321,23 +272,14 @@ func main() {
             }
 
             // read configuration
+            var status crmMon
             err = xml.Unmarshal(monxml, &status)
             if err != nil {
+                fmt.Println("[ERROR]: could not read cluster XML configuration")
                 panic(err)
             }
 
             metrics := parseGenericMetrics(&status)
-            // add genric node metrics
-            clusterNodesConf.Set(float64(metrics.Node.Configured))
-            clusterNodesOnline.Set(float64(metrics.Node.Online))
-            clusterNodesStandby.Set(float64(metrics.Node.Standby))
-            clusterNodesStandbyOnFail.Set(float64(metrics.Node.StandbyOnFail))
-            clusterNodesMaintenance.Set(float64(metrics.Node.Maintenance))
-            clusterNodesPending.Set(float64(metrics.Node.Pending))
-            clusterNodesUnclean.Set(float64(metrics.Node.Unclean))
-            clusterNodesShutdown.Set(float64(metrics.Node.Shutdown))
-            clusterNodesExpectedUp.Set(float64(metrics.Node.ExpectedUp))
-            clusterNodesDC.Set(float64(metrics.Node.DC))
 
             // ressouce status metrics (TODO: rename it to total instead of status T)
             clusterResourcesStatus.WithLabelValues("unique").Set(float64(metrics.Resource.Unique))
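For context, the snapshot pipeline shown above is: run crm_mon once, read its XML from stdout, unmarshal into Go structs. A self-contained sketch with a deliberately tiny stand-in struct (crmMonLite and its field mapping are hypothetical; the exporter's real crmMon type maps far more of the document):

package main

import (
    "encoding/xml"
    "fmt"
    "os/exec"
)

// crmMonLite is a hypothetical, trimmed-down stand-in for the exporter's
// crmMon type; only the configured-node count is mapped here.
type crmMonLite struct {
    Summary struct {
        Nodes struct {
            Number int `xml:"number,attr"`
        } `xml:"nodes_configured"`
    } `xml:"summary"`
}

func main() {
    monxml, err := exec.Command("/usr/sbin/crm_mon", "-1", "--as-xml").Output()
    if err != nil {
        panic(err)
    }
    var status crmMonLite
    if err := xml.Unmarshal(monxml, &status); err != nil {
        fmt.Println("[ERROR]: could not read cluster XML configuration")
        panic(err)
    }
    fmt.Println("nodes configured:", status.Summary.Nodes.Number)
}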
@@ -350,27 +292,37 @@ func main() {
             clusterResourcesStatus.WithLabelValues("failed").Set(float64(metrics.Resource.Failed))
             clusterResourcesStatus.WithLabelValues("failed_ignored").Set(float64(metrics.Resource.FailureIgnored))
             clusterResourcesStatus.WithLabelValues("stopped").Set(float64(metrics.Resource.Stopped))
-            clusterResourcesStatus.WithLabelValues("started").Set(float64(metrics.Resource.Stopped))
+            clusterResourcesStatus.WithLabelValues("started").Set(float64(metrics.Resource.Started))
             clusterResourcesStatus.WithLabelValues("slave").Set(float64(metrics.Resource.Slave))
-            clusterResourcesStatus.WithLabelValues("master").Add(float64(metrics.Resource.Master))
+            clusterResourcesStatus.WithLabelValues("master").Set(float64(metrics.Resource.Master))
 
             // metrics with labels
             clusterNodes.WithLabelValues("member").Set(float64(metrics.Node.TypeMember))
             clusterNodes.WithLabelValues("ping").Set(float64(metrics.Node.TypePing))
             clusterNodes.WithLabelValues("remote").Set(float64(metrics.Node.TypeRemote))
             clusterNodes.WithLabelValues("unknown").Set(float64(metrics.Node.TypeUnknown))
+            clusterNodes.WithLabelValues("configured").Set(float64(metrics.Node.Configured))
+            clusterNodes.WithLabelValues("online").Set(float64(metrics.Node.Online))
+            clusterNodes.WithLabelValues("standby").Set(float64(metrics.Node.Standby))
+            clusterNodes.WithLabelValues("standby_onfail").Set(float64(metrics.Node.StandbyOnFail))
+            clusterNodes.WithLabelValues("maintenance").Set(float64(metrics.Node.Maintenance))
+            clusterNodes.WithLabelValues("pending").Set(float64(metrics.Node.Pending))
+            clusterNodes.WithLabelValues("unclean").Set(float64(metrics.Node.Unclean))
+            clusterNodes.WithLabelValues("shutdown").Set(float64(metrics.Node.Shutdown))
+            clusterNodes.WithLabelValues("expected_up").Set(float64(metrics.Node.ExpectedUp))
+            clusterNodes.WithLabelValues("DC").Set(float64(metrics.Node.DC))
 
             // this will produce a metric like this:
             // cluster_resources{node="dma-dog-hana01" resource_name="RA1" role="master"} 1
             for _, nod := range status.Nodes.Node {
                 for _, rsc := range nod.Resources {
-                    // TODO: FIXME FIND a mechanism to count the resources:
-                    // gauge2, err := pipelineCountMetric.GetMetricWithLabelValues("pipeline2")
-                    clusterResources.WithLabelValues(nod.Name, rsc.ID, rsc.Role).Set(float64(1))
+                    // if there is the same resource just add it. At each iteration it will be destroyed this metric so
+                    // this is safe.
+                    clusterResources.WithLabelValues(nod.Name, rsc.ID, rsc.Role).Inc()
                 }
             }
-
             // TODO: this is historically, we might don't need to do like this. investigate on this later
+            // this can be improved in a more simple way or even removed
             keys := make([]string, len(metrics.PerNode))
             i := 0
             for k := range metrics.PerNode {
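The Set(1)-to-Inc() change above makes repeated instances of the same resource accumulate rather than overwrite each other, which is only safe because the vector is emptied at the top of each iteration. A tiny illustration with made-up data:

package main

import "github.com/prometheus/client_golang/prometheus"

func main() {
    rv := prometheus.NewGaugeVec(
        prometheus.GaugeOpts{
            Name: "cluster_resources",
            Help: "number of cluster resources",
        }, []string{"node", "resource_name", "role"})
    prometheus.MustRegister(rv)
    // Two instances carrying identical labels (e.g. a resource reported
    // twice on one node) now export a count of 2 instead of sticking at 1.
    rv.WithLabelValues("node1", "rsc_demo", "Started").Inc()
    rv.WithLabelValues("node1", "rsc_demo", "Started").Inc()
    // exported: cluster_resources{node="node1",resource_name="rsc_demo",role="Started"} 2
}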
@@ -381,14 +333,13 @@ func main() {
             for _, k := range keys {
                 node := metrics.PerNode[k]
                 clusterResourcesRunning.WithLabelValues(k).Set(float64(node.ResourcesRunning))
-
             }
-            // TODO: make this configurable later
-            time.Sleep(2 * time.Second)
 
+            time.Sleep(time.Duration(int64(*timeoutSeconds)) * time.Second)
         }
     }()
 
     fmt.Println("[INFO]: Serving metrics on port", *portNumber)
+    fmt.Println("[INFO]: refreshing metric timeouts set to", *timeoutSeconds)
     log.Fatal(http.ListenAndServe(*portNumber, nil))
 }
