From 67f675f83752674e9a5de819f4814cc017826140 Mon Sep 17 00:00:00 2001 From: jswxstw Date: Tue, 28 Jan 2025 16:23:21 +0800 Subject: [PATCH] docs: fix the table display issue in metrics.md (#14116) Signed-off-by: oninowang Signed-off-by: Alan Clucas Co-authored-by: Alan Clucas --- docs/metrics.md | 28 ++++++++++++++++++++++++++++ util/telemetry/builder/docs.go | 5 ++++- 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/docs/metrics.md b/docs/metrics.md index fd3d0b000733..43b4da5d4b7b 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -247,6 +247,7 @@ Metrics for the [Four Golden Signals](https://sre.google/sre-book/monitoring-dis #### `cronworkflows_concurrencypolicy_triggered` A counter of the number of times a CronWorkflow has triggered its `concurrencyPolicy` to limit the number of workflows running. + | attribute | explanation | |----------------------|----------------------------------------------------------------------------------| | `name` | ⚠️ The name of the CronWorkflow | @@ -257,6 +258,7 @@ A counter of the number of times a CronWorkflow has triggered its `concurrencyPo A counter of the total number of times a CronWorkflow has been triggered. Suppressed runs due to `concurrencyPolicy: Forbid` will not be counted. + | attribute | explanation | |-------------|-------------------------------------------| | `name` | ⚠️ The name of the CronWorkflow | @@ -267,6 +269,7 @@ Suppressed runs due to `concurrencyPolicy: Forbid` will not be counted. Incidents of deprecated feature being used. Deprecated features are [explained here](deprecations.md). 🚨 This counter may go up much more than once for a single use of the feature. + | attribute | explanation | |-------------|---------------------------------------| | `feature` | The name of the feature used | @@ -282,6 +285,7 @@ Deprecated features are [explained here](deprecations.md). #### `error_count` A counter of certain errors incurred by the controller by cause. 
+ | attribute | explanation | |-----------|------------------------| | `cause` | The cause of the error | @@ -297,6 +301,7 @@ The currently tracked specific errors are A gauge of the number of workflows currently in the cluster in each phase. The `Running` count does not mean that a workflow's pods are running, just that the controller has scheduled them. A workflow can be stuck in `Running` with pending pods for a long time. + | attribute | explanation | |-----------|----------------------------| | `status` | Boolean: `true` or `false` | @@ -308,11 +313,13 @@ A gauge indicating if this Controller is the [leader](high-availability.md#workf - `1` if leader or in standalone mode via [`LEADER_ELECTION_DISABLE=true`](environment-variables.md#controller). - `0` otherwise, indicating that this controller is a standby that is not currently running workflows. + This metric has no attributes. #### `k8s_request_duration` A histogram recording the API requests sent to the Kubernetes API. + | attribute | explanation | |---------------|--------------------------------------------------------------------| | `kind` | The kubernetes `kind` involved in the request such as `configmaps` | @@ -325,6 +332,7 @@ This contains all the information contained in `k8s_request_total` along with ti #### `k8s_request_total` A counter of the number of API requests sent to the Kubernetes API. + | attribute | explanation | |---------------|--------------------------------------------------------------------| | `kind` | The kubernetes `kind` involved in the request such as `configmaps` | @@ -336,6 +344,7 @@ This metric is calculable from `k8s_request_duration`, and it is suggested you j #### `log_messages` A count of log messages emitted by the controller by log level: `error`, `warn` and `info`.
+ | attribute | explanation | |-----------|------------------------------| | `level` | The log level of the message | @@ -345,7 +354,9 @@ A count of log messages emitted by the controller by log level: `error`, `warn` A histogram of durations of operations. An operation is a single workflow reconciliation loop within the workflow-controller. It's the time for the controller to process a single workflow after it has been read from the cluster and is a measure of the performance of the controller affected by the complexity of the workflow. + This metric has no attributes. + The environment variables `OPERATION_DURATION_METRIC_BUCKET_COUNT` and `MAX_OPERATION_TIME` configure the bucket sizes for this metric, unless they are specified using a `histogramBuckets` modifier in the `metricsConfig` block. #### `pod_missing` @@ -353,6 +364,7 @@ The environment variables `OPERATION_DURATION_METRIC_BUCKET_COUNT` and `MAX_OPER Incidents of pod missing. A counter of pods that were not seen - for example, because they were deleted by Kubernetes. You should only see this under high load. + | attribute | explanation | |--------------------|----------------------------------------| | `node_phase` | The phase that the pod's node was in | @@ -363,6 +375,7 @@ You should only see this under high load. #### `pod_pending_count` Total number of pods that started pending by reason. + | attribute | explanation | |-------------|----------------------------------------------| | `reason` | Summary of the kubernetes Reason for pending | @@ -373,6 +386,7 @@ Total number of pods that started pending by reason. A gauge of the number of workflow created pods currently in the cluster in each phase. It is possible for a workflow to start but have no pods running (for example, if the cluster is too busy to run them). This metric sheds light on actual work being done.
+ | attribute | explanation | |-----------|------------------------------| | `phase` | The phase that the pod is in | @@ -380,6 +394,7 @@ This metric sheds light on actual work being done. #### `pods_total_count` Total number of pods that have entered each phase. + | attribute | explanation | |-------------|----------------------------------| | `phase` | The phase that the pod is in | @@ -393,6 +408,7 @@ This is not directly controlled by the workflow controller, so it is possible fo A counter of additions to the work queues inside the controller. The rate of this shows how busy that area of the controller is + | attribute | explanation | |--------------|-----------------------| | `queue_name` | The name of the queue | @@ -411,6 +427,7 @@ This and associated metrics are all directly sourced from the [client-go workque A gauge of the current depth of the queues. If these get large then the workflow controller is not keeping up with the cluster. + | attribute | explanation | |--------------|-----------------------| | `queue_name` | The name of the queue | @@ -428,6 +445,7 @@ This and associated metrics are all directly sourced from the [client-go workque #### `queue_duration` A histogram of the time events in the queues are taking to be processed. + | attribute | explanation | |--------------|-----------------------| | `queue_name` | The name of the queue | @@ -446,6 +464,7 @@ This and associated metrics are all directly sourced from the [client-go workque #### `queue_latency` A histogram of the time events in the queues are taking before they are processed. + | attribute | explanation | |--------------|-----------------------| | `queue_name` | The name of the queue | @@ -464,6 +483,7 @@ This and associated metrics are all directly sourced from the [client-go workque #### `queue_longest_running` A gauge of the number of seconds that this queue's longest running processor has been running for. 
+ | attribute | explanation | |--------------|-----------------------| | `queue_name` | The name of the queue | @@ -481,6 +501,7 @@ This and associated metrics are all directly sourced from the [client-go workque #### `queue_retries` A counter of the number of times a message has been retried in the queue. + | attribute | explanation | |--------------|-----------------------| | `queue_name` | The name of the queue | @@ -498,6 +519,7 @@ This and associated metrics are all directly sourced from the [client-go workque #### `queue_unfinished_work` A gauge of the number of queue items that have not been processed yet. + | attribute | explanation | |--------------|-----------------------| | `queue_name` | The name of the queue | @@ -515,6 +537,7 @@ This and associated metrics are all directly sourced from the [client-go workque #### `total_count` A counter of workflows that have entered each phase for tracking them through their life-cycle, by namespace. + | attribute | explanation | |-------------|-----------------------------------------| | `phase` | The phase that the Workflow has entered | @@ -523,6 +546,7 @@ A counter of workflows that have entered each phase for tracking them through th #### `version` Build metadata for this Controller. + | attribute | explanation | |------------------|-------------------------------------------------------------------------------------------------------| | `version` | The version of Argo | @@ -537,6 +561,7 @@ Build metadata for this Controller. #### `workers_busy_count` A gauge of queue workers that are busy. + | attribute | explanation | |---------------|-------------------| | `worker_type` | The type of queue | @@ -555,6 +580,7 @@ This and associated metrics are all directly sourced from the [client-go workque A gauge of the number of workflows with different conditions. This will tell you the number of workflows with running pods. 
+ | attribute | explanation | |-----------|----------------------------------------------------| | `type` | The type of condition, currently only `PodRunning` | @@ -565,6 +591,7 @@ This will tell you the number of workflows with running pods. A histogram of the runtime of workflows using `workflowTemplateRef` only. Counts both WorkflowTemplate and ClusterWorkflowTemplate usage. Records time between entering the `Running` phase and completion, so does not include any time in `Pending`. + | attribute | explanation | |-----------------|-------------------------------------------------------------| | `name` | ⚠️ The name of the WorkflowTemplate/ClusterWorkflowTemplate. | @@ -575,6 +602,7 @@ Records time between entering the `Running` phase and completion, so does not in A counter of workflows using `workflowTemplateRef` only, as they enter each phase. Counts both WorkflowTemplate and ClusterWorkflowTemplate usage. + | attribute | explanation | |-----------------|-------------------------------------------------------------| | `name` | ⚠️ The name of the WorkflowTemplate/ClusterWorkflowTemplate. 
| diff --git a/util/telemetry/builder/docs.go b/util/telemetry/builder/docs.go index 70e1bb8bd7c8..2c51c56fce30 100644 --- a/util/telemetry/builder/docs.go +++ b/util/telemetry/builder/docs.go @@ -64,14 +64,15 @@ func metricsDocsLines(metrics *metricsList, attribs *attributesList) string { var out bytes.Buffer outWriter := io.Writer(&out) markdown := md.NewMarkdown(outWriter) + markdown.PlainText("") for _, metric := range *metrics { - markdown.PlainText("") markdown.H4(md.Code(metric.displayName())) markdown.PlainText("") markdown.PlainTextf("%s.", metric.Description) if metric.ExtendedDescription != "" { markdown.PlainText(strings.Trim(metric.ExtendedDescription, " \n\t\r")) } + markdown.PlainText("") if len(metric.Attributes) > 0 { rows := [][]string{} @@ -89,6 +90,7 @@ func metricsDocsLines(metrics *metricsList, attribs *attributesList) string { ) } else { markdown.PlainText("This metric has no attributes.") + markdown.PlainText("") } if len(metric.DefaultBuckets) > 0 { buckets := "" @@ -103,6 +105,7 @@ func metricsDocsLines(metrics *metricsList, attribs *attributesList) string { if metric.Notes != "" { markdown.PlainText(strings.Trim(metric.Notes, " \n\t\r")) + markdown.PlainText("") } } markdown.Build()