|
8 | 8 | "net/http" |
9 | 9 | "os/exec" |
10 | 10 | "sort" |
| 11 | + "time" |
11 | 12 |
|
12 | 13 | "github.com/prometheus/client_golang/prometheus" |
13 | 14 | "github.com/prometheus/client_golang/prometheus/promhttp" |
@@ -308,10 +309,10 @@ var ( |
308 | 309 | prometheus.GaugeOpts{ |
309 | 310 | Name: "cluster_resources", |
310 | 311 | Help: "number of cluster resources", |
311 | | - }, []string{"role"}) |
| 312 | + }, []string{"node", "resource_name", "role"}) |
312 | 313 | ) |
313 | 314 |
|
314 | | -func init() { |
| 315 | +func initMetrics() { |
315 | 316 | // Metrics have to be registered to be exposed: |
316 | 317 | prometheus.MustRegister(clusterNodesConf) |
317 | 318 | prometheus.MustRegister(clusterNodesOnline) |
@@ -343,69 +344,94 @@ func init() { |
343 | 344 | var portNumber = flag.String("port", ":9001", "The port number to listen on for HTTP requests.") |
344 | 345 |
|
345 | 346 | func main() { |
| 347 | + // read the CLI options and set up the initial state
346 | 348 | flag.Parse() |
347 | | - // get cluster status xml |
348 | | - monxml, err := exec.Command("/usr/sbin/crm_mon", "-1", "--as-xml", "--group-by-node", "--inactive").Output() |
349 | | - if err != nil { |
350 | | - fmt.Println("[ERROR]: crm_mon command was not executed correctly. Did you have crm_mon installed ?") |
351 | | - panic(err) |
352 | | - } |
| 349 | + initMetrics() |
| 350 | + http.Handle("/metrics", promhttp.Handler()) |
353 | 351 |
|
354 | | - var status crmMon |
355 | | - err = xml.Unmarshal(monxml, &status) |
356 | | - if err != nil { |
357 | | - panic(err) |
358 | | - } |
| 352 | + // parse the cluster configuration every 2 seconds and update the metrics accordingly |
| 353 | + // this runs asynchronously in a goroutine; the metrics are refreshed every 2 seconds (the interval will become a parameter in the future) |
| 354 | + go func() { |
359 | 355 |
|
360 | | - metrics := parseGenericMetrics(&status) |
361 | | - // add genric node metrics |
362 | | - clusterNodesConf.Set(float64(metrics.Node.Configured)) |
363 | | - clusterNodesOnline.Set(float64(metrics.Node.Online)) |
364 | | - clusterNodesStandby.Set(float64(metrics.Node.Standby)) |
365 | | - clusterNodesStandbyOnFail.Set(float64(metrics.Node.StandbyOnFail)) |
366 | | - clusterNodesMaintenance.Set(float64(metrics.Node.Maintenance)) |
367 | | - clusterNodesPending.Set(float64(metrics.Node.Pending)) |
368 | | - clusterNodesUnclean.Set(float64(metrics.Node.Unclean)) |
369 | | - clusterNodesShutdown.Set(float64(metrics.Node.Shutdown)) |
370 | | - clusterNodesExpectedUp.Set(float64(metrics.Node.ExpectedUp)) |
371 | | - clusterNodesDC.Set(float64(metrics.Node.DC)) |
372 | | - // add genric resource metrics |
373 | | - clusterResourcesUnique.Set(float64(metrics.Resource.Unique)) |
374 | | - clusterResourcesDisabled.Set(float64(metrics.Resource.Disabled)) |
375 | | - clusterResourcesConf.Set(float64(metrics.Resource.Configured)) |
376 | | - clusterResourcesActive.Set(float64(metrics.Resource.Active)) |
377 | | - clusterResourcesOrphaned.Set(float64(metrics.Resource.Orphaned)) |
378 | | - clusterResourcesBlocked.Set(float64(metrics.Resource.Blocked)) |
379 | | - clusterResourcesManaged.Set(float64(metrics.Resource.Managed)) |
380 | | - clusterResourcesFailed.Set(float64(metrics.Resource.Failed)) |
381 | | - clusterResourcesFailedIgnored.Set(float64(metrics.Resource.FailureIgnored)) |
| 356 | + for { |
382 | 357 |
|
383 | | - // metrics with labels |
384 | | - clusterNodes.WithLabelValues("member").Add(float64(metrics.Node.TypeMember)) |
385 | | - clusterNodes.WithLabelValues("ping").Add(float64(metrics.Node.TypePing)) |
386 | | - clusterNodes.WithLabelValues("remote").Add(float64(metrics.Node.TypeRemote)) |
387 | | - clusterNodes.WithLabelValues("unknown").Add(float64(metrics.Node.TypeUnknown)) |
388 | | - |
389 | | - clusterNodes.WithLabelValues("stopped").Add(float64(metrics.Resource.Stopped)) |
390 | | - clusterNodes.WithLabelValues("started").Add(float64(metrics.Resource.Started)) |
391 | | - clusterNodes.WithLabelValues("slave").Add(float64(metrics.Resource.Slave)) |
392 | | - clusterNodes.WithLabelValues("master").Add(float64(metrics.Resource.Master)) |
393 | | - |
394 | | - // TODO: this is historically, we might don't need to do like this. investigate on this later |
395 | | - keys := make([]string, len(metrics.PerNode)) |
396 | | - i := 0 |
397 | | - for k := range metrics.PerNode { |
398 | | - keys[i] = k |
399 | | - i++ |
400 | | - } |
401 | | - sort.Strings(keys) |
402 | | - for _, k := range keys { |
403 | | - node := metrics.PerNode[k] |
404 | | - clusterResourcesRunning.WithLabelValues(k).Add(float64(node.ResourcesRunning)) |
405 | | - } |
| 358 | + var status crmMon |
| 359 | + // get cluster status xml |
| 360 | + fmt.Println("[INFO]: Reading cluster configuration with crm_mon..") |
| 361 | + monxml, err := exec.Command("/usr/sbin/crm_mon", "-1", "--as-xml", "--group-by-node", "--inactive").Output() |
| 362 | + if err != nil { |
| 363 | + fmt.Println("[ERROR]: crm_mon command was not executed correctly. Did you have crm_mon installed ?") |
| 364 | + panic(err) |
| 365 | + } |
| 366 | + |
| 367 | + // read configuration |
| 368 | + err = xml.Unmarshal(monxml, &status) |
| 369 | + if err != nil { |
| 370 | + panic(err) |
| 371 | + } |
| 372 | + |
| 373 | + metrics := parseGenericMetrics(&status) |
| 374 | + // add generic node metrics |
| 375 | + clusterNodesConf.Set(float64(metrics.Node.Configured)) |
| 376 | + clusterNodesOnline.Set(float64(metrics.Node.Online)) |
| 377 | + clusterNodesStandby.Set(float64(metrics.Node.Standby)) |
| 378 | + clusterNodesStandbyOnFail.Set(float64(metrics.Node.StandbyOnFail)) |
| 379 | + clusterNodesMaintenance.Set(float64(metrics.Node.Maintenance)) |
| 380 | + clusterNodesPending.Set(float64(metrics.Node.Pending)) |
| 381 | + clusterNodesUnclean.Set(float64(metrics.Node.Unclean)) |
| 382 | + clusterNodesShutdown.Set(float64(metrics.Node.Shutdown)) |
| 383 | + clusterNodesExpectedUp.Set(float64(metrics.Node.ExpectedUp)) |
| 384 | + clusterNodesDC.Set(float64(metrics.Node.DC)) |
| 385 | + // add generic resource metrics |
| 386 | + clusterResourcesUnique.Set(float64(metrics.Resource.Unique)) |
| 387 | + clusterResourcesDisabled.Set(float64(metrics.Resource.Disabled)) |
| 388 | + clusterResourcesConf.Set(float64(metrics.Resource.Configured)) |
| 389 | + clusterResourcesActive.Set(float64(metrics.Resource.Active)) |
| 390 | + clusterResourcesOrphaned.Set(float64(metrics.Resource.Orphaned)) |
| 391 | + clusterResourcesBlocked.Set(float64(metrics.Resource.Blocked)) |
| 392 | + clusterResourcesManaged.Set(float64(metrics.Resource.Managed)) |
| 393 | + clusterResourcesFailed.Set(float64(metrics.Resource.Failed)) |
| 394 | + clusterResourcesFailedIgnored.Set(float64(metrics.Resource.FailureIgnored)) |
| 395 | + |
| 396 | + // metrics with labels |
| 397 | + clusterNodes.WithLabelValues("member").Set(float64(metrics.Node.TypeMember)) |
| 398 | + clusterNodes.WithLabelValues("ping").Set(float64(metrics.Node.TypePing)) |
| 399 | + clusterNodes.WithLabelValues("remote").Set(float64(metrics.Node.TypeRemote)) |
| 400 | + clusterNodes.WithLabelValues("unknown").Set(float64(metrics.Node.TypeUnknown)) |
| 401 | + |
| 402 | + // TODO: rename this metric (e.g. clusterResourcesTotal) |
| 403 | + // clusterResourcesTotal.WithLabelValues("stopped").Add(float64(metrics.Resource.Stopped)) |
| 404 | + // clusterResources.WithLabelValues("started").Add(float64(metrics.Resource.Started)) |
| 405 | + // clusterResources.WithLabelValues("slave").Add(float64(metrics.Resource.Slave)) |
| 406 | + // clusterResources.WithLabelValues("master").Add(float64(metrics.Resource.Master)) |
| 407 | + |
| 408 | + // this will produce a metric like this: |
| 409 | + // cluster_resources{node="dma-dog-hana01",resource_name="RA1",role="master"} 1 |
| 410 | + for _, nod := range status.Nodes.Node { |
| 411 | + for _, rsc := range nod.Resources { |
| 412 | + // FIXME: find a mechanism to count the resources |
| 413 | + clusterResources.WithLabelValues(nod.Name, rsc.ID, rsc.Role).Set(float64(1)) |
| 414 | + } |
| 415 | + } |
| 416 | + |
| 417 | + // TODO: this is historical; we might not need to do it this way. Investigate later. |
| 418 | + keys := make([]string, len(metrics.PerNode)) |
| 419 | + i := 0 |
| 420 | + for k := range metrics.PerNode { |
| 421 | + keys[i] = k |
| 422 | + i++ |
| 423 | + } |
| 424 | + sort.Strings(keys) |
| 425 | + for _, k := range keys { |
| 426 | + node := metrics.PerNode[k] |
| 427 | + clusterResourcesRunning.WithLabelValues(k).Set(float64(node.ResourcesRunning)) |
| 428 | + } |
| 429 | + // TODO: make this configurable later |
| 430 | + time.Sleep(2 * time.Second) |
| 431 | + |
| 432 | + } |
| 433 | + }() |
406 | 434 |
|
407 | | - // serve metrics |
408 | | - http.Handle("/metrics", promhttp.Handler()) |
409 | 435 | fmt.Println("[INFO]: Serving metrics on port", *portNumber) |
410 | 436 | log.Fatal(http.ListenAndServe(*portNumber, nil)) |
411 | 437 | } |