Skip to content

Commit 86d85cd

Browse files
authored
[Monitoring] Remove staging and prod probe alerts (#3642)
1 parent ef4b248 commit 86d85cd

File tree

3 files changed

+2
-24
lines changed

3 files changed

+2
-24
lines changed

monitoring/prometheus/prometheus_production.yml

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,6 @@ scrape_configs:
2222
static_configs:
2323
- targets: ['eval.ai']
2424

25-
- job_name: 'evalai'
26-
metrics_path: '/'
27-
static_configs:
28-
- targets: ['eval.ai']
29-
3025
alerting:
3126
alertmanagers:
3227
- path_prefix: '/alert_manager'

monitoring/prometheus/prometheus_staging.yml

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,6 @@ scrape_configs:
2222
static_configs:
2323
- targets: ['staging.eval.ai']
2424

25-
- job_name: 'evalai'
26-
metrics_path: '/'
27-
static_configs:
28-
- targets: ['staging.eval.ai']
29-
3025
alerting:
3126
alertmanagers:
3227
- path_prefix: '/alert_manager'

monitoring/prometheus/rules.yml

Lines changed: 2 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -23,26 +23,14 @@ groups:
2323
severity: major
2424
group: 'instance'
2525

26-
- name: EvalAI-Instance-Status
27-
rules:
28-
- alert: InstanceDown
29-
expr: up{job="evalai"} == 0
30-
for: 5m
31-
annotations:
32-
title: "EvalAI is down"
33-
description: "*{{ $labels.job }}* on *{{ $labels.instance }}* has been down for more than 5 minutes"
34-
labels:
35-
severity: major
36-
group: 'instance'
37-
3826
- name: Worker-Down
3927
rules:
4028
- alert: WorkerDown
41-
expr: ((count by (queue_name) (num_submissions_in_queue{is_remote="0"})) - (count by (queue_name) (num_processed_submissions{is_remote="0"}))) >= 5
29+
# Backlog per queue: queued minus processed submissions, falling back to the
# queued count for queues that have no matching processed-submissions series.
# PromQL comparison operators bind tighter than `or`, so the union is wrapped
# in parentheses to apply the `>= 5` threshold to both branches.
expr: (((sum by (queue_name) (num_submissions_in_queue{is_remote="0"})) - (sum by (queue_name) (num_processed_submissions{is_remote="0"}))) or sum by (queue_name) (num_submissions_in_queue{is_remote="0"})) >= 5
4230
for: 120m
4331
annotations:
4432
title: "Worker is down"
45-
description: "*{{ $labels.queue_name }}* worker is not processing submissions"
33+
# The alert expression aggregates with `sum by (queue_name)`, so `queue_name`
# is the only label the query result carries; an `instance` label is not
# available to this template and would render as an empty string.
description: "*{{ $labels.queue_name }}* worker is not processing submissions"
4634
labels:
4735
severity: major
4836
group: 'queue_name'

0 commit comments

Comments
 (0)