Skip to content

Commit

Permalink
[Feature][prometheus] add flink kubernetes operator metrics collect (#…
Browse files Browse the repository at this point in the history
…746)

* feature: update carp-security

* feature: update flink kubernetes operator 部署

* feature: update flink kubernetes operator 部署

* feature: update flink kubernetes operator 部署

* feature: update flink kubernetes operator 部署

* feature: update flink kubernetes operator 部署

* feature: update flink kubernetes operator 部署

* feature: update prometheus operator 部署

* feature: update prometheus operator 部署

* feature: update prometheus operator 部署

---------

Co-authored-by: wangqi <[email protected]>
  • Loading branch information
kalencaya and wangqi authored Aug 14, 2024
1 parent f53db8b commit d2e15f9
Show file tree
Hide file tree
Showing 4 changed files with 121 additions and 53 deletions.
9 changes: 9 additions & 0 deletions tools/kubernetes/flink/values-session.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,22 @@ defaultConfiguration:
kubernetes.operator.reconcile.interval: 15 s
kubernetes.operator.observer.progress-check.interval: 5 s
kubernetes.operator.metrics.reporter.prom.factory.class: org.apache.flink.metrics.prometheus.PrometheusReporterFactory
kubernetes.operator.metrics.reporter.prom.port: 9999
s3.endpoint: http://${IP}:9000
s3.access-key: admin
s3.secret-key: password
s3.path.style.access: true
# (Optional) Exposes metrics port on the container if defined
metrics:
port: 9999

# set TimeZone from UTC to Asia/Shanghai
operatorPod:
env:
- name: TZ
value: Asia/Shanghai


22 changes: 22 additions & 0 deletions tools/kubernetes/flink/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,25 @@ operatorPod:
env:
- name: TZ
value: Asia/Shanghai

defaultConfiguration:
# If set to true,
# (1) loads the built-in default configuration
# (2) appends the below flink-conf and logging configuration overrides
# If set to false, loads just the overrides as in (2).
# This option has no effect if create is equal to false.
append: true
flink-conf.yaml: |+
# Flink Config Overrides
kubernetes.operator.metrics.reporter.slf4j.factory.class: org.apache.flink.metrics.slf4j.Slf4jReporterFactory
kubernetes.operator.metrics.reporter.slf4j.interval: 5 MINUTE
kubernetes.operator.reconcile.interval: 15 s
kubernetes.operator.observer.progress-check.interval: 5 s
kubernetes.operator.metrics.reporter.prom.factory.class: org.apache.flink.metrics.prometheus.PrometheusReporterFactory
kubernetes.operator.metrics.reporter.prom.port: 9999
# (Optional) Exposes metrics port on the container if defined
metrics:
port: 9999
44 changes: 44 additions & 0 deletions tools/kubernetes/prometheus/values-prometheus-operator-backup.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Alertmanager settings that expose the service on a NodePort and route
# alerts to a WeCom (enterprise WeChat) robot webhook receiver.
alertmanager:
  service:
    ## Port for Alertmanager Service to listen on
    port: 9093
    ## To be used with a proxy extraContainer port
    targetPort: 9093
    ## Port to expose on each node
    ## Only used if service.type is 'NodePort'
    nodePort: 30903
    ## Service type
    type: NodePort
  config:
    route:
      group_by: ['namespace']
      group_wait: 30s
      group_interval: 5m
      repeat_interval: 12h
      receiver: 'wechat'
      routes:
        - receiver: 'wechat'
          matchers:
            - alertname = "Watchdog"
    receivers:
      - name: 'wechat'
        ## Alertmanager spells these keys with underscores, and
        ## webhook_configs is a list of webhook entries.
        webhook_configs:
          - send_resolved: true
            ## Replace <wechat-robot-key> with the real key of the WeCom robot
            ## and uncomment the line below; a webhook entry needs a url
            ## before it can deliver notifications.
            # url: https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=<wechat-robot-key>
99 changes: 46 additions & 53 deletions tools/kubernetes/prometheus/values-prometheus-operator.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,28 @@ defaultRules:
## Configuration for alertmanager
## ref: https://prometheus.io/docs/alerting/alertmanager/
alertmanager:
## Deploy alertmanager
enabled: true
## Alertmanager configuration directives
## ref: https://prometheus.io/docs/alerting/configuration/#configuration-file
## https://prometheus.io/webtools/alerting/routing-tree-editor/
config:
route:
group_by: ['namespace']
group_wait: 10s
group_interval: 10s
repeat_interval: 12h
receiver: 'webhook'
routes:
- receiver: 'webhook'
matchers:
- alertname="Watchdog"
receivers:
- name: 'webhook'
webhook_configs:
- send_resolved: false
url: http://10.10.18.163:8080/carp/api/carp/alert

## Configuration for Alertmanager service
service:
## Port for Alertmanager Service to listen on
Expand All @@ -39,52 +60,6 @@ alertmanager:
serviceMonitor:
## If true, a ServiceMonitor will be created for the AlertManager service.
selfMonitor: false
## Alertmanager configuration directives
## ref: https://prometheus.io/docs/alerting/configuration/#configuration-file
## https://prometheus.io/webtools/alerting/routing-tree-editor/
config:
inhibit_rules:
- source_matchers:
- 'severity = critical'
target_matchers:
- 'severity =~ warning|info'
equal:
- 'namespace'
- 'alertname'
- source_matchers:
- 'severity = warning'
target_matchers:
- 'severity = info'
equal:
- 'namespace'
- 'alertname'
- source_matchers:
- 'alertname = InfoInhibitor'
target_matchers:
- 'severity = info'
equal:
- 'namespace'
- target_matchers:
- 'alertname = InfoInhibitor'
route:
group_by: ['namespace']
group_wait: 30s
group_interval: 5m
repeat_interval: 12h
receiver: 'wechat'
routes:
- receiver: 'wechat'
matchers:
- alertname = "Watchdog"
receivers:
- name: 'wechat'
webhook-configs:
send-resolved: true
# 替换企微机器人的真正key
# url: https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=<wechat-robot-key>

templates:
- '/etc/alertmanager/config/*.tmpl'

## Using default values from https://github.com/grafana/helm-charts/blob/main/charts/grafana/values.yaml
## Grafana isn't managed by prometheus
Expand Down Expand Up @@ -262,17 +237,32 @@ prometheus:
interval: 10s
## HTTP path to scrape for metrics
# path: /metrics


additionalPodMonitors: []
# - name: flink-kubernetes-operator-service-metrics
# additionalLabels:
# group: flink-kubernetes-operator-metrics
# platform: scaleph
## TODO: scaleph can later add more labels to the Flink pods and pass them on to Prometheus and Grafana this way
# podTargetLabels: []
# selector:
# matchLabels:
# app: flink-kubernetes-operator-metrics
# component: metrics
# platform: scaleph
# endpoints:
# - port: prom-metrics
# interval: 10s

additionalPodMonitors:
## Name of the PodMonitor to create
##
# - name: ""
- name: "flink-kubernetes-operator-metrics"

## Additional labels to set used for the PodMonitorSelector. Together with standard labels from
## the chart
##
# additionalLabels: {}
additionalLabels:
group: flink-kubernetes-operator-pod-metrics
platform: scaleph

## Pod label for use in assembling a job name of the form <label value>-<port>
## If no label is specified, the pod endpoint name is used.
Expand All @@ -281,7 +271,9 @@ prometheus:

## Label selector for pods to which this PodMonitor applies
##
# selector: {}
selector:
matchLabels:
app.kubernetes.io/name: flink-kubernetes-operator

## PodTargetLabels transfers labels on the Kubernetes Pod onto the target.
##
Expand All @@ -305,4 +297,5 @@ prometheus:
## Endpoints of the selected pods to be monitored
## https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api.md#podmetricsendpoint
##
# podMetricsEndpoints: []
podMetricsEndpoints:
- port: metrics

0 comments on commit d2e15f9

Please sign in to comment.