@@ -109,6 +109,8 @@ type perNodeMetrics struct {
 // this historically from hawk-apiserver and parse some generic metrics
 // it will retrieve and parse cluster data and counters
 func parseGenericMetrics(status *crmMon) *clusterMetrics {
+
+    // clusterState saves all the XML data. These are the metrics we will later convert to gauges etc.
     clusterState := &clusterMetrics{}

     clusterState.Node.Configured = status.Summary.Nodes.Number
@@ -191,6 +193,7 @@ func parseGenericMetrics(status *crmMon) *clusterMetrics {
                 clusterState.Resource.FailureIgnored++
             }
         }
+
     }

     clusterState.Resource.Unique = len(rscIds)
@@ -199,58 +202,7 @@ func parseGenericMetrics(status *crmMon) *clusterMetrics {
 }

 var (
-    // simple gauge metric
-    clusterNodesConf = prometheus.NewGauge(prometheus.GaugeOpts{
-        Name: "cluster_nodes_configured",
-        Help: "Number of nodes configured in ha cluster",
-    })
-
-    clusterNodesOnline = prometheus.NewGauge(prometheus.GaugeOpts{
-        Name: "cluster_nodes_online",
-        Help: "Number of nodes online in ha cluster",
-    })
-
-    clusterNodesStandby = prometheus.NewGauge(prometheus.GaugeOpts{
-        Name: "cluster_nodes_standby",
-        Help: "Number of nodes standby in ha cluster",
-    })
-
-    clusterNodesStandbyOnFail = prometheus.NewGauge(prometheus.GaugeOpts{
-        Name: "cluster_nodes_stanby_onfail",
-        Help: "Number of nodes standby onfail in ha cluster",
-    })
-
-    clusterNodesMaintenance = prometheus.NewGauge(prometheus.GaugeOpts{
-        Name: "cluster_nodes_maintenance",
-        Help: "Number of nodes in maintainance in ha cluster",
-    })
-
-    clusterNodesPending = prometheus.NewGauge(prometheus.GaugeOpts{
-        Name: "cluster_nodes_pending",
-        Help: "Number of nodes pending in ha cluster",
-    })
-
-    clusterNodesUnclean = prometheus.NewGauge(prometheus.GaugeOpts{
-        Name: "cluster_nodes_unclean",
-        Help: "Number of nodes unclean in ha cluster",
-    })
-
-    clusterNodesShutdown = prometheus.NewGauge(prometheus.GaugeOpts{
-        Name: "cluster_nodes_shutdown",
-        Help: "Number of nodes shutdown in ha cluster",
-    })
-
-    clusterNodesExpectedUp = prometheus.NewGauge(prometheus.GaugeOpts{
-        Name: "cluster_nodes_expected_up",
-        Help: "Number of nodes expected up in ha cluster",
-    })
-
-    clusterNodesDC = prometheus.NewGauge(prometheus.GaugeOpts{
-        Name: "cluster_nodes_expected_dc",
-        Help: "Number of nodes dc in ha cluster",
-    })
-
-    // a gauge metric with label
+    // metrics with labels (prefer these as a general guideline)
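+    // e.g. a single cluster_nodes vector with one label value per node type/state replaces the
+    // ten individual cluster_nodes_* gauges removed above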
     clusterNodes = prometheus.NewGaugeVec(
         prometheus.GaugeOpts{
             Name: "cluster_nodes",
@@ -262,7 +214,7 @@ var (
             Name: "cluster_resources_running",
             Help: "number of cluster resources running",
         }, []string{"node"})
-
+    // TODO: rename this to nodeResource
     clusterResources = prometheus.NewGaugeVec(
         prometheus.GaugeOpts{
             Name: "cluster_resources",
@@ -277,27 +229,16 @@ var (
 )

 func initMetrics() {
-    // Metrics have to be registered to be exposed:
-    prometheus.MustRegister(clusterNodesConf)
-    prometheus.MustRegister(clusterNodesOnline)
-    prometheus.MustRegister(clusterNodesStandby)
-    prometheus.MustRegister(clusterNodesStandbyOnFail)
-    prometheus.MustRegister(clusterNodesMaintenance)
-    prometheus.MustRegister(clusterNodesPending)
-    prometheus.MustRegister(clusterNodesUnclean)
-    prometheus.MustRegister(clusterNodesShutdown)
-    prometheus.MustRegister(clusterNodesExpectedUp)
-    prometheus.MustRegister(clusterNodesDC)
-
-    // metrics with labels
+
     prometheus.MustRegister(clusterNodes)
+    // resources TODO: these 3 metrics can be refactored
     prometheus.MustRegister(clusterResourcesRunning)
     prometheus.MustRegister(clusterResources)
     prometheus.MustRegister(clusterResourcesStatus)
-
 }

 var portNumber = flag.String("port", ":9001", "The port number to listen on for HTTP requests.")
+var timeoutSeconds = flag.Int("timeout", 5, "timeout in seconds the exporter waits before fetching new data")

 func main() {
     // read cli option and setup initial stat
@@ -311,7 +252,17 @@ func main() {

         for {

-            var status crmMon
+            // We want to reset certain metrics to 0 on each iteration to drop stale state.
+            // Since we have complex/nested metrics with multiple labels, unregistering and re-registering is the cleanest way.
+            prometheus.Unregister(clusterResources)
+            // overwrite the package-level metric with an empty one (plain assignment rather than :=,
+            // so the next iteration's Unregister removes this new collector)
+            clusterResources = prometheus.NewGaugeVec(
+                prometheus.GaugeOpts{
+                    Name: "cluster_resources",
+                    Help: "number of cluster resources",
+                }, []string{"node", "resource_name", "role"})
+            prometheus.MustRegister(clusterResources)
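+            // NOTE: prometheus.GaugeVec also exposes Reset(), which clears every label combination;
+            // calling clusterResources.Reset() here might be a simpler alternative to the
+            // unregister/re-register dance above.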
+
             // get cluster status xml
             fmt.Println("[INFO]: Reading cluster configuration with crm_mon..")
             monxml, err := exec.Command("/usr/sbin/crm_mon", "-1", "--as-xml", "--group-by-node", "--inactive").Output()
@@ -321,23 +272,14 @@ func main() {
             }

             // read configuration
+            var status crmMon
             err = xml.Unmarshal(monxml, &status)
             if err != nil {
+                fmt.Println("[ERROR]: could not read cluster XML configuration")
                 panic(err)
             }

             metrics := parseGenericMetrics(&status)
-            // add genric node metrics
-            clusterNodesConf.Set(float64(metrics.Node.Configured))
-            clusterNodesOnline.Set(float64(metrics.Node.Online))
-            clusterNodesStandby.Set(float64(metrics.Node.Standby))
-            clusterNodesStandbyOnFail.Set(float64(metrics.Node.StandbyOnFail))
-            clusterNodesMaintenance.Set(float64(metrics.Node.Maintenance))
-            clusterNodesPending.Set(float64(metrics.Node.Pending))
-            clusterNodesUnclean.Set(float64(metrics.Node.Unclean))
-            clusterNodesShutdown.Set(float64(metrics.Node.Shutdown))
-            clusterNodesExpectedUp.Set(float64(metrics.Node.ExpectedUp))
-            clusterNodesDC.Set(float64(metrics.Node.DC))

             // ressouce status metrics (TODO: rename it to total instead of status T)
             clusterResourcesStatus.WithLabelValues("unique").Set(float64(metrics.Resource.Unique))
@@ -350,27 +292,37 @@ func main() {
             clusterResourcesStatus.WithLabelValues("failed").Set(float64(metrics.Resource.Failed))
             clusterResourcesStatus.WithLabelValues("failed_ignored").Set(float64(metrics.Resource.FailureIgnored))
             clusterResourcesStatus.WithLabelValues("stopped").Set(float64(metrics.Resource.Stopped))
-            clusterResourcesStatus.WithLabelValues("started").Set(float64(metrics.Resource.Stopped))
+            clusterResourcesStatus.WithLabelValues("started").Set(float64(metrics.Resource.Started))
             clusterResourcesStatus.WithLabelValues("slave").Set(float64(metrics.Resource.Slave))
-            clusterResourcesStatus.WithLabelValues("master").Add(float64(metrics.Resource.Master))
+            clusterResourcesStatus.WithLabelValues("master").Set(float64(metrics.Resource.Master))

             // metrics with labels
             clusterNodes.WithLabelValues("member").Set(float64(metrics.Node.TypeMember))
             clusterNodes.WithLabelValues("ping").Set(float64(metrics.Node.TypePing))
             clusterNodes.WithLabelValues("remote").Set(float64(metrics.Node.TypeRemote))
             clusterNodes.WithLabelValues("unknown").Set(float64(metrics.Node.TypeUnknown))
+            clusterNodes.WithLabelValues("configured").Set(float64(metrics.Node.Configured))
+            clusterNodes.WithLabelValues("online").Set(float64(metrics.Node.Online))
+            clusterNodes.WithLabelValues("standby").Set(float64(metrics.Node.Standby))
+            clusterNodes.WithLabelValues("standby_onfail").Set(float64(metrics.Node.StandbyOnFail))
+            clusterNodes.WithLabelValues("maintenance").Set(float64(metrics.Node.Maintenance))
+            clusterNodes.WithLabelValues("pending").Set(float64(metrics.Node.Pending))
+            clusterNodes.WithLabelValues("unclean").Set(float64(metrics.Node.Unclean))
+            clusterNodes.WithLabelValues("shutdown").Set(float64(metrics.Node.Shutdown))
+            clusterNodes.WithLabelValues("expected_up").Set(float64(metrics.Node.ExpectedUp))
+            clusterNodes.WithLabelValues("DC").Set(float64(metrics.Node.DC))

             // this will produce a metric like this:
             // cluster_resources{node="dma-dog-hana01" resource_name="RA1" role="master"} 1
             for _, nod := range status.Nodes.Node {
                 for _, rsc := range nod.Resources {
-                    // TODO: FIXME FIND a mechanism to count the resources:
-                    // gauge2, err := pipelineCountMetric.GetMetricWithLabelValues("pipeline2")
-                    clusterResources.WithLabelValues(nod.Name, rsc.ID, rsc.Role).Set(float64(1))
+                    // if the same resource appears again, just increment it; the metric is
+                    // destroyed and re-created on each iteration, so this is safe.
+                    clusterResources.WithLabelValues(nod.Name, rsc.ID, rsc.Role).Inc()
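+                    // e.g. a cloned resource appearing twice on the same node (hypothetical names) would
+                    // be exported as cluster_resources{node="node01",resource_name="RA1",role="Started"} 2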
                 }
             }
-
             // TODO: this is historically, we might don't need to do like this. investigate on this later
+            // this can be improved in a simpler way or even removed
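+            // (the sorted key slice is presumably there to emit per-node metrics in a deterministic
+            // order, since Go map iteration order is randomized)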
             keys := make([]string, len(metrics.PerNode))
             i := 0
             for k := range metrics.PerNode {
@@ -381,14 +333,13 @@ func main() {
             for _, k := range keys {
                 node := metrics.PerNode[k]
                 clusterResourcesRunning.WithLabelValues(k).Set(float64(node.ResourcesRunning))
-
             }
-            // TODO: make this configurable later
-            time.Sleep(2 * time.Second)

+            time.Sleep(time.Duration(int64(*timeoutSeconds)) * time.Second)
         }
     }()

     fmt.Println("[INFO]: Serving metrics on port", *portNumber)
+    fmt.Println("[INFO]: metrics refresh timeout set to", *timeoutSeconds, "seconds")
     log.Fatal(http.ListenAndServe(*portNumber, nil))
 }