---
# infrastructure/monitoring/prometheus/rules/alert-system-rules.yml
# Prometheus alerting rules for the Bakery Alert and Recommendation System

groups:
  - name: alert_system_health
    rules:
      # System component health alerts
      - alert: AlertSystemComponentDown
        expr: alert_system_component_health == 0
        for: 2m
        labels:
          severity: critical
          service: "{{ $labels.service }}"
          component: "{{ $labels.component }}"
        annotations:
          summary: "Alert system component {{ $labels.component }} is unhealthy"
          description: "Component {{ $labels.component }} in service {{ $labels.service }} has been unhealthy for more than 2 minutes."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#component-health"

      # Connection health alerts
      - alert: RabbitMQConnectionDown
        expr: alert_rabbitmq_connection_status == 0
        for: 1m
        labels:
          severity: critical
          service: "{{ $labels.service }}"
        annotations:
          summary: "RabbitMQ connection down for {{ $labels.service }}"
          description: "Service {{ $labels.service }} has lost connection to RabbitMQ for more than 1 minute."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#rabbitmq-connection"

      - alert: RedisConnectionDown
        expr: alert_redis_connection_status == 0
        for: 1m
        labels:
          severity: critical
          service: "{{ $labels.service }}"
        annotations:
          summary: "Redis connection down for {{ $labels.service }}"
          description: "Service {{ $labels.service }} has lost connection to Redis for more than 1 minute."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#redis-connection"

      # Leader election issues
      - alert: NoSchedulerLeader
        expr: sum(alert_scheduler_leader_status) == 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "No scheduler leader elected"
          description: "No service has been elected as scheduler leader for more than 5 minutes. Scheduled checks may not be running."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#leader-election"

  - name: alert_system_performance
    rules:
      # High error rates
      - alert: HighAlertProcessingErrorRate
        expr: rate(alert_processing_errors_total[5m]) > 0.1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High alert processing error rate"
          description: "Alert processing error rate is {{ $value | humanizePercentage }} over the last 5 minutes."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#processing-errors"

      - alert: HighNotificationDeliveryFailureRate
        expr: rate(alert_delivery_failures_total[5m]) / rate(alert_notifications_sent_total[5m]) > 0.05
        for: 3m
        labels:
          severity: warning
          channel: "{{ $labels.channel }}"
        annotations:
          summary: "High notification delivery failure rate for {{ $labels.channel }}"
          description: "Notification delivery failure rate for {{ $labels.channel }} is {{ $value | humanizePercentage }} over the last 5 minutes."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#delivery-failures"

      # Processing latency
      - alert: HighAlertProcessingLatency
        expr: histogram_quantile(0.95, rate(alert_processing_duration_seconds_bucket[5m])) > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High alert processing latency"
          description: "95th percentile alert processing latency is {{ $value }}s, exceeding 5s threshold."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#processing-latency"

      # SSE connection issues
      - alert: TooManySSEConnections
        expr: sum(alert_sse_active_connections) > 1000
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Too many active SSE connections"
          description: "Number of active SSE connections ({{ $value }}) exceeds 1000. This may impact performance."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#sse-connections"

      - alert: SSEConnectionErrors
        expr: rate(alert_sse_connection_errors_total[5m]) > 0.5
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "High SSE connection error rate"
          description: "SSE connection error rate is {{ $value }} errors/second over the last 5 minutes."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#sse-errors"

  - name: alert_system_business
    rules:
      # Alert volume anomalies
      - alert: UnusuallyHighAlertVolume
        expr: rate(alert_items_published_total{item_type="alert"}[10m]) > 2
        for: 5m
        labels:
          severity: warning
          service: "{{ $labels.service }}"
        annotations:
          summary: "Unusually high alert volume from {{ $labels.service }}"
          description: "Service {{ $labels.service }} is generating alerts at {{ $value }} alerts/second, which is above normal levels."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#high-volume"

      - alert: NoAlertsGenerated
        expr: rate(alert_items_published_total[30m]) == 0
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "No alerts generated recently"
          description: "No alerts have been generated in the last 30 minutes. This may indicate a problem with detection systems."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#no-alerts"

      # Response time issues
      - alert: SlowAlertResponseTime
        expr: histogram_quantile(0.95, rate(alert_item_response_time_seconds_bucket[1h])) > 3600
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Slow alert response times"
          description: "95th percentile alert response time is {{ $value | humanizeDuration }}, exceeding 1 hour."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#response-times"

      # Critical alerts not acknowledged
      - alert: CriticalAlertsUnacknowledged
        expr: sum(alert_active_items_current{item_type="alert",severity="urgent"}) > 5
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "Multiple critical alerts unacknowledged"
          description: "{{ $value }} critical alerts remain unacknowledged for more than 10 minutes."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#critical-unacked"

  - name: alert_system_capacity
    rules:
      # Queue size monitoring
      - alert: LargeSSEMessageQueues
        expr: alert_sse_message_queue_size > 100
        for: 5m
        labels:
          severity: warning
          tenant_id: "{{ $labels.tenant_id }}"
        annotations:
          summary: "Large SSE message queue for tenant {{ $labels.tenant_id }}"
          description: "SSE message queue for tenant {{ $labels.tenant_id }} has {{ $value }} messages, indicating potential client issues."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#sse-queues"

      # Database storage issues
      - alert: SlowDatabaseStorage
        expr: histogram_quantile(0.95, rate(alert_database_storage_duration_seconds_bucket[5m])) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Slow database storage for alerts"
          description: "95th percentile database storage time is {{ $value }}s, exceeding 1s threshold."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#database-storage"

  - name: alert_system_effectiveness
    rules:
      # False positive rate monitoring
      - alert: HighFalsePositiveRate
        expr: alert_false_positive_rate > 0.2
        for: 30m
        labels:
          severity: warning
          service: "{{ $labels.service }}"
          alert_type: "{{ $labels.alert_type }}"
        annotations:
          summary: "High false positive rate for {{ $labels.alert_type }}"
          description: "False positive rate for {{ $labels.alert_type }} in {{ $labels.service }} is {{ $value | humanizePercentage }}."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#false-positives"

      # Low recommendation adoption
      - alert: LowRecommendationAdoption
        expr: rate(alert_recommendations_implemented_total[24h]) / rate(alert_items_published_total{item_type="recommendation"}[24h]) < 0.1
        for: 1h
        labels:
          severity: info
          service: "{{ $labels.service }}"
        annotations:
          summary: "Low recommendation adoption rate"
          description: "Recommendation adoption rate for {{ $labels.service }} is {{ $value | humanizePercentage }} over the last 24 hours."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#recommendation-adoption"

  # Additional alerting rules for specific scenarios
  - name: alert_system_critical_scenarios
    rules:
      # Complete system failure
      - alert: AlertSystemDown
        expr: up{job=~"alert-processor|notification-service"} == 0
        for: 1m
        labels:
          severity: critical
          service: "{{ $labels.job }}"
        annotations:
          summary: "Alert system service {{ $labels.job }} is down"
          description: "Critical alert system service {{ $labels.job }} has been down for more than 1 minute."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#service-down"

      # Data loss prevention
      - alert: AlertDataNotPersisted
        expr: rate(alert_items_processed_total[5m]) > 0 and rate(alert_database_storage_duration_seconds_count[5m]) == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Alert data not being persisted to database"
          description: "Alerts are being processed but not stored in database, potential data loss."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#data-persistence"

      # Notification blackhole
      - alert: NotificationsNotDelivered
        expr: rate(alert_items_processed_total[5m]) > 0 and rate(alert_notifications_sent_total[5m]) == 0
        for: 3m
        labels:
          severity: critical
        annotations:
          summary: "Notifications not being delivered"
          description: "Alerts are being processed but no notifications are being sent."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#notification-delivery"