Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

告警无法发送到飞书 #1952

Closed
bilbilmyc opened this issue May 15, 2024 · 4 comments
Closed

告警无法发送到飞书 #1952

bilbilmyc opened this issue May 15, 2024 · 4 comments

Comments

@bilbilmyc
Copy link

Your config.toml

[Global]
RunMode = "release"

[Log]
# log write dir
Dir = "logs"
# log level: DEBUG INFO WARNING ERROR
Level = "DEBUG"
# stdout, stderr, file
Output = "file"
# # rotate by time
KeepHours = 24
# # rotate by size
RotateNum = 10
# # unit: MB
RotateSize = 256

[HTTP]
# http listening address
Host = "0.0.0.0"
# http listening port
Port = 17000
# https cert file path
CertFile = ""
# https key file path
KeyFile = ""
# whether to print access log
PrintAccessLog = false
# whether to enable pprof
PProf = false
# expose prometheus /metrics?
ExposeMetrics = true
# http graceful shutdown timeout, unit: s
ShutdownTimeout = 30
# max content length: 64M
MaxContentLength = 67108864
# http server read timeout, unit: s
ReadTimeout = 20
# http server write timeout, unit: s
WriteTimeout = 40
# http server idle timeout, unit: s
IdleTimeout = 120

[HTTP.ShowCaptcha]
Enable = false

[HTTP.APIForAgent]
Enable = true
# [HTTP.APIForAgent.BasicAuth]
# user001 = "ccc26da7b9aba533cbb263a36c07dcc5"

[HTTP.APIForService]
Enable = true
[HTTP.APIForService.BasicAuth]
user001 = "ccc26da7b9aba533cbb263a36c07dcc5"

[HTTP.JWTAuth]
# signing key
SigningKey = "5b94a0fd640fe2765af826acfe42d151"
# unit: min
AccessExpired = 1500
# unit: min
RefreshExpired = 10080
RedisKeyPrefix = "/jwt/"

[HTTP.ProxyAuth]
# if proxy auth enabled, jwt auth is disabled
Enable = false
# username key in http proxy header
HeaderUserNameKey = "X-User-Name"
DefaultRoles = ["Standard"]

[HTTP.RSA]
# open RSA
OpenRSA = false
# Before replacing the key file, make sure that there are no encrypted variables in the database "configs".
# It is recommended to decrypt and remove all encrypted values from the database before replacing the key file.
# This will prevent any potential issues with accessing or decrypting the variables using the new key file.
# RSA public key (auto create)
RSAPublicKeyPath = "etc/rsa/public.pem"
# RSA private key (auto create)
RSAPrivateKeyPath = "etc/rsa/private.pem"
# RSA private key password
RSAPassWord = "n9e@n9e!"

[DB]
# postgres: DSN="host=127.0.0.1 port=5432 user=root dbname=n9e_v6 password=1234 sslmode=disable"
DSN="root:1234@tcp(mysql:3306)/n9e_v6?charset=utf8mb4&parseTime=True&loc=Local&allowNativePasswords=true"
# enable debug mode or not
Debug = false
# mysql postgres
DBType = "mysql"
# unit: s
MaxLifetime = 7200
# max open connections
MaxOpenConns = 150
# max idle connections
MaxIdleConns = 50
# enable auto migrate or not
# EnableAutoMigrate = false

[Redis]
# address, ip:port or ip1:port,ip2:port for cluster and sentinel(SentinelAddrs)
Address = "redis:6379"
# Username = ""
# Password = ""
# DB = 0
# UseTLS = false
# TLSMinVersion = "1.2"
# standalone cluster sentinel
RedisType = "standalone"
# Mastername for sentinel type
# MasterName = "mymaster"
# SentinelUsername = ""
# SentinelPassword = ""

[Alert]
[Alert.Heartbeat]
# auto detect if blank
IP = ""
# unit ms
Interval = 1000
EngineName = "default"

# [Alert.Alerting]
# NotifyConcurrency = 10

[Center]
MetricsYamlFile = "./etc/metrics.yaml"
I18NHeaderKey = "X-Language"

[Center.AnonymousAccess]
PromQuerier = false
AlertDetail = false

[Pushgw]
# use target labels in database instead of in series
LabelRewrite = true
# # default busigroup key name
# BusiGroupLabelKey = "busigroup"
ForceUseServerTS = true

# [Pushgw.DebugSample]
# ident = "xx"
# __name__ = "xx"

# [Pushgw.WriterOpt]
# QueueMaxSize = 1000000
# QueuePopSize = 1000

[[Pushgw.Writers]]
# Url = "http://127.0.0.1:8480/insert/0/prometheus/api/v1/write"
Url = "http://victoriametrics:8428/api/v1/write"
# Basic auth username
BasicAuthUser = ""
# Basic auth password
BasicAuthPass = ""
# timeout settings, unit: ms
Headers = ["X-From", "n9e"]
Timeout = 10000
DialTimeout = 3000
TLSHandshakeTimeout = 30000
ExpectContinueTimeout = 1000
IdleConnTimeout = 90000
# time duration, unit: ms
KeepAlive = 30000
MaxConnsPerHost = 0
MaxIdleConns = 100
MaxIdleConnsPerHost = 100
## Optional TLS Config
# UseTLS = false
# TLSCA = "/etc/n9e/ca.pem"
# TLSCert = "/etc/n9e/cert.pem"
# TLSKey = "/etc/n9e/key.pem"
# InsecureSkipVerify = false
# [[Writers.WriteRelabels]]
# Action = "replace"
# SourceLabels = ["__address__"]
# Regex = "([^:]+)(?::\\d+)?"
# Replacement = "$1:80"
# TargetLabel = "__address__"

[Ibex]
Enable = true
RPCListen = "0.0.0.0:20090"

Relevant logs

DEBUG.log:2024-05-15 17:43:35.540867 DEBUG process/process.go:326 rule_eval:alert-1-52 event:&{Id:0 Cate:prometheus Cluster:VictoriaMetrics-1 DatasourceId:1 GroupId:3 GroupName:博云k8s Hash:bbd7976261129f886d9b8a3da99030bb RuleId:52 RuleName:进程监控-进程总量超过600 RuleNote: RuleProd:metric RuleAlgo: Severity:2 PromForDuration:60 PromQl:processes_total > 600 RuleConfig:{"inhibit":false,"queries":[{"keys":{"labelKey":"","valueKey":""},"prom_ql":"processes_total \u003e 600","severity":2}]} RuleConfigJson:map[inhibit:false queries:[map[keys:map[labelKey: valueKey:] prom_ql:processes_total > 600 severity:2]]] PromEvalInterval:15 Callbacks: CallbacksJSON:[] RunbookUrl: NotifyRecovered:1 NotifyChannels:feishu feishucard NotifyChannelsJSON:[feishu feishucard] NotifyGroups:2 NotifyGroupsJSON:[2] NotifyGroupsObj:[] TargetIdent:g102 TargetNote: TriggerTime:1715766215 TriggerValue:805 TriggerValues: Tags:__name__=processes_total,,ident=g102,,rulename=进程监控-进程总量超过600,,source=categraf TagsJSON:[__name__=processes_total ident=g102 rulename=进程监控-进程总量超过600 source=categraf] TagsMap:map[__name__:processes_total ident:g102 rulename:进程监控-进程总量超过600 source:categraf] Annotations:{} AnnotationsJSON:map[] IsRecovered:false NotifyUsersObj:[] LastEvalTime:1715766215 LastSentTime:0 NotifyCurNumber:0 FirstTriggerTime:0 ExtraConfig:<nil> Status:0 Claimant: SubRuleId:0 ExtraInfo:[]} fire

DEBUG.log:2024-05-15 17:43:35.540924 DEBUG process/process.go:326 rule_eval:alert-1-52 event:&{Id:0 Cate:prometheus Cluster:VictoriaMetrics-1 DatasourceId:1 GroupId:3 GroupName:博云k8s Hash:f56075dbb452f3c1be33d595e20b15f5 RuleId:52 RuleName:进程监控-进程总量超过600 RuleNote: RuleProd:metric RuleAlgo: Severity:2 PromForDuration:60 PromQl:processes_total > 600 RuleConfig:{"inhibit":false,"queries":[{"keys":{"labelKey":"","valueKey":""},"prom_ql":"processes_total \u003e 600","severity":2}]} RuleConfigJson:map[inhibit:false queries:[map[keys:map[labelKey: valueKey:] prom_ql:processes_total > 600 severity:2]]] PromEvalInterval:15 Callbacks: CallbacksJSON:[] RunbookUrl: NotifyRecovered:1 NotifyChannels:feishu feishucard NotifyChannelsJSON:[feishu feishucard] NotifyGroups:2 NotifyGroupsJSON:[2] NotifyGroupsObj:[] TargetIdent:g131 TargetNote: TriggerTime:1715766215 TriggerValue:1191 TriggerValues: Tags:__name__=processes_total,,ident=g131,,rulename=进程监控-进程总量超过600,,source=categraf TagsJSON:[__name__=processes_total ident=g131 rulename=进程监控-进程总量超过600 source=categraf] TagsMap:map[__name__:processes_total ident:g131 rulename:进程监控-进程总量超过600 source:categraf] Annotations:{} AnnotationsJSON:map[] IsRecovered:false NotifyUsersObj:[] LastEvalTime:1715766215 LastSentTime:0 NotifyCurNumber:0 FirstTriggerTime:0 ExtraConfig:<nil> Status:0 Claimant: SubRuleId:0 ExtraInfo:[]} fire

DEBUG.log:2024-05-15 17:43:35.540971 DEBUG process/process.go:326 rule_eval:alert-1-52 event:&{Id:0 Cate:prometheus Cluster:VictoriaMetrics-1 DatasourceId:1 GroupId:3 GroupName:博云k8s Hash:c0bec6215adfbf1008b79f6c99496197 RuleId:52 RuleName:进程监控-进程总量超过600 RuleNote: RuleProd:metric RuleAlgo: Severity:2 PromForDuration:60 PromQl:processes_total > 600 RuleConfig:{"inhibit":false,"queries":[{"keys":{"labelKey":"","valueKey":""},"prom_ql":"processes_total \u003e 600","severity":2}]} RuleConfigJson:map[inhibit:false queries:[map[keys:map[labelKey: valueKey:] prom_ql:processes_total > 600 severity:2]]] PromEvalInterval:15 Callbacks: CallbacksJSON:[] RunbookUrl: NotifyRecovered:1 NotifyChannels:feishu feishucard NotifyChannelsJSON:[feishu feishucard] NotifyGroups:2 NotifyGroupsJSON:[2] NotifyGroupsObj:[] TargetIdent:hngpu03 TargetNote: TriggerTime:1715766215 TriggerValue:751 TriggerValues: Tags:__name__=processes_total,,ident=hngpu03,,rulename=进程监控-进程总量超过600,,source=categraf TagsJSON:[__name__=processes_total ident=hngpu03 rulename=进程监控-进程总量超过600 source=categraf] TagsMap:map[__name__:processes_total ident:hngpu03 rulename:进程监控-进程总量超过600 source:categraf] Annotations:{} AnnotationsJSON:map[] IsRecovered:false NotifyUsersObj:[] LastEvalTime:1715766215 LastSentTime:0 NotifyCurNumber:0 FirstTriggerTime:0 ExtraConfig:<nil> Status:0 Claimant: SubRuleId:0 ExtraInfo:[]} fire

System info

Ubuntu20,docker20,docker-compose 2.27.0

Steps to reproduce

  1. 通过docker-compose部署的夜莺

  2. 时序库也是docker-compose安装的vm时序库

  3. 二进制安装的categraf

  4. 安装完成后,可以正常打开平台,导入告警规则后也可以查看到告警信息

  5. 创建飞书机器人,复制webhook地址

  6. 打开平台,人员组织-用户管理-新增,创建了一个用户test-feishu,联系方式选择系统自带的feishu_robot_token,右边的框填写机器人的webhook地址
    image
    image
    image

  7. 团队管理-添加test-feishu用户

  8. 告警规则选择告警组

  9. 告警一直有,但是没有发送告警到飞书群
    image
    image

...

Expected behavior

期望可以发送告警到飞书

Actual behavior

实际生成了告警但是没有发送告警信息

Additional info

No response

@710leo
Copy link
Member

710leo commented May 15, 2024

可以从日志检索下 _sender 关键字,看看有什么线索
grep "_sender" *.log | more

@bilbilmyc
Copy link
Author

bilbilmyc commented May 15, 2024

检索"_sender" 没有任何日志,只有一个smtp的报错,不过我不使用邮箱

root@pt13:/var/log/nightingale# grep -r "_sender"
root@pt13:/var/log/nightingale# ll
total 17640
-rw-r--r-- 1 root root  2096010 May 15 18:07 DEBUG.log
-rw-r--r-- 1 root root 13809199 May 15 18:00 DEBUG.log.2024051517
-rw-r--r-- 1 root root        0 May 15 18:00 ERROR.log
-rw-r--r-- 1 root root        0 May 15 17:17 ERROR.log.2024051517
-rw-r--r-- 1 root root        0 May 15 18:00 FATAL.log
-rw-r--r-- 1 root root        0 May 15 17:17 FATAL.log.2024051517
-rw-r--r-- 1 root root     9630 May 15 18:07 INFO.log
-rw-r--r-- 1 root root   106391 May 15 18:00 INFO.log.2024051517
-rw-r--r-- 1 root root        0 May 15 18:00 WARNING.log
-rw-r--r-- 1 root root       83 May 15 17:17 WARNING.log.2024051517
root@pt13:/var/log/nightingale# grep -r "sender"
WARNING.log.2024051517:2024-05-15 17:17:26.288165 WARNING sender/email.go:138 SMTP configurations invalid

@bilbilmyc
Copy link
Author

告警突然可以发出来了,等晚点我检索一下日志贴出来。

@bilbilmyc
Copy link
Author

重启后解决问题

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants