Add new alert architecture

Urtzi Alfaro
2025-08-23 10:19:58 +02:00
parent 1a9839240e
commit 4b4268d640
45 changed files with 6518 additions and 1590 deletions

@@ -0,0 +1,243 @@
# infrastructure/monitoring/prometheus/rules/alert-system-rules.yml
# Prometheus alerting rules for the Bakery Alert and Recommendation System
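# These rules can be sanity-checked before loading with promtool (shipped with
# Prometheus):
#   promtool check rules alert-system-rules.yml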
groups:
  - name: alert_system_health
    rules:
      # System component health alerts
      - alert: AlertSystemComponentDown
        expr: alert_system_component_health == 0
        for: 2m
        labels:
          severity: critical
          service: "{{ $labels.service }}"
          component: "{{ $labels.component }}"
        annotations:
          summary: "Alert system component {{ $labels.component }} is unhealthy"
          description: "Component {{ $labels.component }} in service {{ $labels.service }} has been unhealthy for more than 2 minutes."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#component-health"

      # Connection health alerts
      - alert: RabbitMQConnectionDown
        expr: alert_rabbitmq_connection_status == 0
        for: 1m
        labels:
          severity: critical
          service: "{{ $labels.service }}"
        annotations:
          summary: "RabbitMQ connection down for {{ $labels.service }}"
          description: "Service {{ $labels.service }} has lost connection to RabbitMQ for more than 1 minute."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#rabbitmq-connection"

      - alert: RedisConnectionDown
        expr: alert_redis_connection_status == 0
        for: 1m
        labels:
          severity: critical
          service: "{{ $labels.service }}"
        annotations:
          summary: "Redis connection down for {{ $labels.service }}"
          description: "Service {{ $labels.service }} has lost connection to Redis for more than 1 minute."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#redis-connection"

      # Leader election issues
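      # sum() aggregates the per-instance leader gauge, so this fires only when
      # no instance holds leadership at all. A split-brain condition (more than
      # one leader) is not covered here; a sketch of such a check, assuming the
      # same metric:
      #   sum(alert_scheduler_leader_status) > 1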
      - alert: NoSchedulerLeader
        expr: sum(alert_scheduler_leader_status) == 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "No scheduler leader elected"
          description: "No service has been elected as scheduler leader for more than 5 minutes. Scheduled checks may not be running."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#leader-election"

  - name: alert_system_performance
    rules:
      # High error rates
      - alert: HighAlertProcessingErrorRate
        expr: rate(alert_processing_errors_total[5m]) > 0.1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High alert processing error rate"
description: "Alert processing error rate is {{ $value | humanizePercentage }} over the last 5 minutes."
runbook_url: "https://docs.bakery.local/runbooks/alert-system#processing-errors"
      - alert: HighNotificationDeliveryFailureRate
        expr: rate(alert_delivery_failures_total[5m]) / rate(alert_notifications_sent_total[5m]) > 0.05
        for: 3m
        labels:
          severity: warning
          channel: "{{ $labels.channel }}"
        annotations:
          summary: "High notification delivery failure rate for {{ $labels.channel }}"
          description: "Notification delivery failure rate for {{ $labels.channel }} is {{ $value | humanizePercentage }} over the last 5 minutes."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#delivery-failures"

      # Processing latency
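      # histogram_quantile expects per-bucket rates; if several instances
      # export this histogram, the buckets may need aggregating by `le` first
      # (a sketch; the exact label set depends on the exporter):
      #   histogram_quantile(0.95, sum by (le) (rate(alert_processing_duration_seconds_bucket[5m])))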
      - alert: HighAlertProcessingLatency
        expr: histogram_quantile(0.95, rate(alert_processing_duration_seconds_bucket[5m])) > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High alert processing latency"
description: "95th percentile alert processing latency is {{ $value }}s, exceeding 5s threshold."
runbook_url: "https://docs.bakery.local/runbooks/alert-system#processing-latency"
# SSE connection issues
- alert: TooManySSEConnections
expr: sum(alert_sse_active_connections) > 1000
for: 2m
labels:
severity: warning
annotations:
summary: "Too many active SSE connections"
description: "Number of active SSE connections ({{ $value }}) exceeds 1000. This may impact performance."
runbook_url: "https://docs.bakery.local/runbooks/alert-system#sse-connections"
- alert: SSEConnectionErrors
expr: rate(alert_sse_connection_errors_total[5m]) > 0.5
for: 3m
labels:
severity: warning
annotations:
summary: "High SSE connection error rate"
description: "SSE connection error rate is {{ $value }} errors/second over the last 5 minutes."
runbook_url: "https://docs.bakery.local/runbooks/alert-system#sse-errors"
- name: alert_system_business
rules:
# Alert volume anomalies
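      # The 2/s threshold below is static; a baseline-relative variant could
      # compare against the same series a day earlier (a sketch, assuming a
      # roughly daily traffic pattern):
      #   rate(alert_items_published_total{item_type="alert"}[10m])
      #     > 3 * rate(alert_items_published_total{item_type="alert"}[10m] offset 1d)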
      - alert: UnusuallyHighAlertVolume
        expr: rate(alert_items_published_total{item_type="alert"}[10m]) > 2
        for: 5m
        labels:
          severity: warning
          service: "{{ $labels.service }}"
        annotations:
          summary: "Unusually high alert volume from {{ $labels.service }}"
          description: "Service {{ $labels.service }} is generating alerts at {{ $value }} alerts/second, which is above normal levels."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#high-volume"

      - alert: NoAlertsGenerated
        expr: rate(alert_items_published_total[30m]) == 0
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "No alerts generated recently"
          description: "No alerts have been generated in the last 30 minutes. This may indicate a problem with detection systems."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#no-alerts"

      # Response time issues
      - alert: SlowAlertResponseTime
        expr: histogram_quantile(0.95, rate(alert_item_response_time_seconds_bucket[1h])) > 3600
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Slow alert response times"
          description: "95th percentile alert response time is {{ $value | humanizeDuration }}, exceeding 1 hour."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#response-times"

      # Critical alerts not acknowledged
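      # The expr filters on the item-level severity label, assumed here to be
      # "urgent" for the most severe items, while the Prometheus alert itself
      # is labeled critical; adjust the matcher if the item taxonomy differs.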
      - alert: CriticalAlertsUnacknowledged
        expr: sum(alert_active_items_current{item_type="alert",severity="urgent"}) > 5
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "Multiple critical alerts unacknowledged"
          description: "{{ $value }} critical alerts remain unacknowledged for more than 10 minutes."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#critical-unacked"

  - name: alert_system_capacity
    rules:
      # Queue size monitoring
      - alert: LargeSSEMessageQueues
        expr: alert_sse_message_queue_size > 100
        for: 5m
        labels:
          severity: warning
          tenant_id: "{{ $labels.tenant_id }}"
        annotations:
          summary: "Large SSE message queue for tenant {{ $labels.tenant_id }}"
          description: "SSE message queue for tenant {{ $labels.tenant_id }} has {{ $value }} messages, indicating potential client issues."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#sse-queues"

      # Database storage issues
      - alert: SlowDatabaseStorage
        expr: histogram_quantile(0.95, rate(alert_database_storage_duration_seconds_bucket[5m])) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Slow database storage for alerts"
description: "95th percentile database storage time is {{ $value }}s, exceeding 1s threshold."
runbook_url: "https://docs.bakery.local/runbooks/alert-system#database-storage"
- name: alert_system_effectiveness
rules:
# False positive rate monitoring
- alert: HighFalsePositiveRate
expr: alert_false_positive_rate > 0.2
for: 30m
labels:
severity: warning
service: "{{ $labels.service }}"
alert_type: "{{ $labels.alert_type }}"
annotations:
summary: "High false positive rate for {{ $labels.alert_type }}"
description: "False positive rate for {{ $labels.alert_type }} in {{ $labels.service }} is {{ $value | humanizePercentage }}."
runbook_url: "https://docs.bakery.local/runbooks/alert-system#false-positives"
# Low recommendation adoption
- alert: LowRecommendationAdoption
expr: rate(alert_recommendations_implemented_total[24h]) / rate(alert_items_published_total{item_type="recommendation"}[24h]) < 0.1
for: 1h
labels:
severity: info
service: "{{ $labels.service }}"
annotations:
summary: "Low recommendation adoption rate"
description: "Recommendation adoption rate for {{ $labels.service }} is {{ $value | humanizePercentage }} over the last 24 hours."
runbook_url: "https://docs.bakery.local/runbooks/alert-system#recommendation-adoption"
# Additional alerting rules for specific scenarios
- name: alert_system_critical_scenarios
rules:
# Complete system failure
- alert: AlertSystemDown
expr: up{job=~"alert-processor|notification-service"} == 0
for: 1m
labels:
severity: critical
service: "{{ $labels.job }}"
annotations:
summary: "Alert system service {{ $labels.job }} is down"
description: "Critical alert system service {{ $labels.job }} has been down for more than 1 minute."
runbook_url: "https://docs.bakery.local/runbooks/alert-system#service-down"
# Data loss prevention
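      # `and` intersects series whose label sets match exactly; since the two
      # metrics below may carry different labels, ignoring labels via `and on()`
      # may be needed for the intersection to be non-empty (a sketch):
      #   rate(alert_items_processed_total[5m]) > 0
      #     and on() rate(alert_database_storage_duration_seconds_count[5m]) == 0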
      - alert: AlertDataNotPersisted
        expr: rate(alert_items_processed_total[5m]) > 0 and rate(alert_database_storage_duration_seconds_count[5m]) == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Alert data not being persisted to database"
description: "Alerts are being processed but not stored in database, potential data loss."
runbook_url: "https://docs.bakery.local/runbooks/alert-system#data-persistence"
# Notification blackhole
- alert: NotificationsNotDelivered
expr: rate(alert_items_processed_total[5m]) > 0 and rate(alert_notifications_sent_total[5m]) == 0
for: 3m
labels:
severity: critical
annotations:
summary: "Notifications not being delivered"
description: "Alerts are being processed but no notifications are being sent."
runbook_url: "https://docs.bakery.local/runbooks/alert-system#notification-delivery"