CrunchyData
diff --git a/‎changelogs/fragments/461.yml‎
Lines changed: 4 additions & 0 deletions b/‎changelogs/fragments/461.yml‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎prometheus/common/alert-rules.d/crunchy-alert-rules-common.yml‎
Lines changed: 21 additions & 0 deletions b/‎prometheus/common/alert-rules.d/crunchy-alert-rules-common.yml‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎prometheus/common/alert-rules.d/crunchy-alert-rules-etcd.yml.example‎
Lines changed: 0 additions & 33 deletions b/‎prometheus/common/alert-rules.d/crunchy-alert-rules-etcd.yml.example‎
Lines changed: 0 additions & 33 deletions
diff --git a/‎prometheus/common/alert-rules.d/crunchy-alert-rules-pg.yml.example‎
Lines changed: 0 additions & 12 deletions b/‎prometheus/common/alert-rules.d/crunchy-alert-rules-pg.yml.example‎
Lines changed: 0 additions & 12 deletions
@@ -0,0 +1,4 @@
+---
+minor_changes:
+ - prometheus - Removed unnecessary absence alerts. The general ExporterDown alert can cover these scenarios
+ - prometheus - Moved the ExporterDown alert to its own common alerts file and enabled it by default (no .example extension on the file name)
@@ -0,0 +1,21 @@
+###
+#
+# Copyright © 2017-2025 Crunchy Data Solutions, Inc. All Rights Reserved.
+#
+###
+
+groups:
+- name: alert-rules
+ rules:
+
+########## COMMON RULES ##########
+ - alert: ExporterDown
+ expr: avg_over_time(up[5m]) < 0.5
+ for: 10s
+ labels:
+ service: system
+ severity: critical
+ severity_num: 300
+ annotations:
+ description: 'Metrics exporter service for {{ $labels.job }} running on {{ $labels.instance }} has been down at least 50% of the time for the last 5 minutes. Service may be flapping or down.'
+ summary: 'Prometheus Exporter Service Down'
@@ -56,36 +56,3 @@ groups:
 # severity_num: 300
 # annotations:
 # description: 'The expected minimum count of etcd nodes was not found. Current count {{ $value }}'
-
-# Absence alerts must be configured per named job, otherwise there's no way to know which job is down
-# Below is are some examples using the leader metric for a targets called "etcd#" for a 3 node etcd cluster
-
-# - alert: ETCDAbsent_etcd1
-# expr: absent(etcd_server_has_leader{job="ip11_etcd1"})
-# for: 10s
-# labels:
-# service: etcd
-# severity: critical
-# severity_num: 300
-# annotations:
-# description: 'Leader metric is absent from target {{ $labels.job }}. Check that etcd is running on target host.'
-
-# - alert: ETCDAbsent_etcd2
-# expr: absent(etcd_server_has_leader{job="ip21_etcd2"})
-# for: 10s
-# labels:
-# service: etcd
-# severity: critical
-# severity_num: 300
-# annotations:
-# description: 'Leader metric is absent from target {{ $labels.job }}. Check that etcd is running on target host.'
-
-# - alert: ETCDAbsent_etcd3
-# expr: absent(etcd_server_has_leader{job="ip31_etcd3"})
-# for: 10s
-# labels:
-# service: etcd
-# severity: critical
-# severity_num: 300
-# annotations:
-# description: 'Leader metric is absent from target {{ $labels.job }}. Check that etcd is running on target host.'
@@ -164,18 +164,6 @@ groups:
 # summary: '{{ $labels.job }} has changed from replica to primary'
 
 
-## Absence alerts must be configured per named job, otherwise there's no way to know which job is down
-## Below is an example for a target job called "Prod"
-# - alert: PGConnectionAbsent_Prod
-# expr: absent(ccp_connection_stats_max_connections{job="Prod"})
-# for: 10s
-# labels:
-# service: postgresql
-# severity: critical
-# severity_num: 300
-# annotations:
-# description: 'Connection metric is absent from target (Prod). Check that postgres_exporter can connect to PostgreSQL.'
-
 
 ## Optional monitor for changes to pg_settings (postgresql.conf) system catalog.
 ## A similar metric is available for monitoring pg_hba.conf. See ccp_hba_settings_checksum.