You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Copy file name to clipboard | Expand all lines: monitoring/prometheus/rules.yml
+2 -14 | Lines changed: 2 additions & 14 deletions
Original file line number
Diff line number
Diff line change
@@ -23,26 +23,14 @@ groups:
23
23
severity: major
24
24
group: 'instance'
25
25
26
-
- name: EvalAI-Instance-Status
27
-
rules:
28
-
- alert: InstanceDown
29
-
expr: up{job="evalai"} == 0
30
-
for: 5m
31
-
annotations:
32
-
title: "EvalAI is down"
33
-
description: "*{{ $labels.job }}* on *{{ $labels.instance }}* has been down for more than 5 minutes"
34
-
labels:
35
-
severity: major
36
-
group: 'instance'
37
-
38
26
- name: Worker-Down
39
27
rules:
40
28
- alert: WorkerDown
41
-
expr: ((count by (queue_name) (num_submissions_in_queue{is_remote="0"})) - (count by (queue_name) (num_processed_submissions{is_remote="0"}))) >= 5
29
+
# Backlog per queue: submissions in queue minus processed submissions; `or` falls back to the raw in-queue count for queues that have no processed series.
# The outer parentheses are required: in PromQL `>=` binds tighter than `or`, so without them the threshold applied only to the fallback operand.
expr: (((sum by (queue_name) (num_submissions_in_queue{is_remote="0"})) - (sum by (queue_name) (num_processed_submissions{is_remote="0"}))) or (sum by (queue_name) (num_submissions_in_queue{is_remote="0"}))) >= 5
42
30
for: 120m
43
31
annotations:
44
32
title: "Worker is down"
45
-
description: "*{{ $labels.queue_name }}* worker is not processing submissions"
33
+
# The rule's expr aggregates with `sum by (queue_name)`, so queue_name is the only label available to this template; other labels would render empty.
description: "*{{ $labels.queue_name }}* worker is not processing submissions"
0 commit comments