# infrastructure/monitoring/prometheus/rules/alert-system-rules.yml
# Prometheus alerting rules for the Bakery Alert and Recommendation System
groups:
  - name: alert_system_health
    rules:
      # System component health alerts
      - alert: AlertSystemComponentDown
        expr: alert_system_component_health == 0
        for: 2m
        labels:
          severity: critical
          service: "{{ $labels.service }}"
          component: "{{ $labels.component }}"
        annotations:
          summary: "Alert system component {{ $labels.component }} is unhealthy"
          description: "Component {{ $labels.component }} in service {{ $labels.service }} has been unhealthy for more than 2 minutes."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#component-health"

      # Connection health alerts
      - alert: RabbitMQConnectionDown
        expr: alert_rabbitmq_connection_status == 0
        for: 1m
        labels:
          severity: critical
          service: "{{ $labels.service }}"
        annotations:
          summary: "RabbitMQ connection down for {{ $labels.service }}"
          description: "Service {{ $labels.service }} has lost connection to RabbitMQ for more than 1 minute."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#rabbitmq-connection"

      - alert: RedisConnectionDown
        expr: alert_redis_connection_status == 0
        for: 1m
        labels:
          severity: critical
          service: "{{ $labels.service }}"
        annotations:
          summary: "Redis connection down for {{ $labels.service }}"
          description: "Service {{ $labels.service }} has lost connection to Redis for more than 1 minute."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#redis-connection"

      # Leader election issues
      - alert: NoSchedulerLeader
        expr: sum(alert_scheduler_leader_status) == 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "No scheduler leader elected"
          description: "No service has been elected as scheduler leader for more than 5 minutes. Scheduled checks may not be running."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#leader-election"

  - name: alert_system_performance
    rules:
      # High error rates
      - alert: HighAlertProcessingErrorRate
        expr: rate(alert_processing_errors_total[5m]) > 0.1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High alert processing error rate"
          description: "Alert processing error rate is {{ $value }} errors/second over the last 5 minutes."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#processing-errors"

      - alert: HighNotificationDeliveryFailureRate
        expr: rate(alert_delivery_failures_total[5m]) / rate(alert_notifications_sent_total[5m]) > 0.05
        for: 3m
        labels:
          severity: warning
          channel: "{{ $labels.channel }}"
        annotations:
          summary: "High notification delivery failure rate for {{ $labels.channel }}"
          description: "Notification delivery failure rate for {{ $labels.channel }} is {{ $value | humanizePercentage }} over the last 5 minutes."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#delivery-failures"

      # Processing latency
      - alert: HighAlertProcessingLatency
        expr: histogram_quantile(0.95, rate(alert_processing_duration_seconds_bucket[5m])) > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High alert processing latency"
          description: "95th percentile alert processing latency is {{ $value }}s, exceeding the 5s threshold."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#processing-latency"

      # SSE connection issues
      - alert: TooManySSEConnections
        expr: sum(alert_sse_active_connections) > 1000
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Too many active SSE connections"
          description: "Number of active SSE connections ({{ $value }}) exceeds 1000. This may impact performance."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#sse-connections"
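      # Note: the TooManySSEConnections expression above sums
      # alert_sse_active_connections across all reporting series, so the 1000
      # limit is a global total for the whole deployment, not a per-instance cap.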
runbook_url: "https://docs.bakery.local/runbooks/alert-system#sse-connections" - alert: SSEConnectionErrors expr: rate(alert_sse_connection_errors_total[5m]) > 0.5 for: 3m labels: severity: warning annotations: summary: "High SSE connection error rate" description: "SSE connection error rate is {{ $value }} errors/second over the last 5 minutes." runbook_url: "https://docs.bakery.local/runbooks/alert-system#sse-errors" - name: alert_system_business rules: # Alert volume anomalies - alert: UnusuallyHighAlertVolume expr: rate(alert_items_published_total{item_type="alert"}[10m]) > 2 for: 5m labels: severity: warning service: "{{ $labels.service }}" annotations: summary: "Unusually high alert volume from {{ $labels.service }}" description: "Service {{ $labels.service }} is generating alerts at {{ $value }} alerts/second, which is above normal levels." runbook_url: "https://docs.bakery.local/runbooks/alert-system#high-volume" - alert: NoAlertsGenerated expr: rate(alert_items_published_total[30m]) == 0 for: 15m labels: severity: warning annotations: summary: "No alerts generated recently" description: "No alerts have been generated in the last 30 minutes. This may indicate a problem with detection systems." runbook_url: "https://docs.bakery.local/runbooks/alert-system#no-alerts" # Response time issues - alert: SlowAlertResponseTime expr: histogram_quantile(0.95, rate(alert_item_response_time_seconds_bucket[1h])) > 3600 for: 10m labels: severity: warning annotations: summary: "Slow alert response times" description: "95th percentile alert response time is {{ $value | humanizeDuration }}, exceeding 1 hour." runbook_url: "https://docs.bakery.local/runbooks/alert-system#response-times" # Critical alerts not acknowledged - alert: CriticalAlertsUnacknowledged expr: sum(alert_active_items_current{item_type="alert",severity="urgent"}) > 5 for: 10m labels: severity: critical annotations: summary: "Multiple critical alerts unacknowledged" description: "{{ $value }} critical alerts remain unacknowledged for more than 10 minutes." runbook_url: "https://docs.bakery.local/runbooks/alert-system#critical-unacked" - name: alert_system_capacity rules: # Queue size monitoring - alert: LargeSSEMessageQueues expr: alert_sse_message_queue_size > 100 for: 5m labels: severity: warning tenant_id: "{{ $labels.tenant_id }}" annotations: summary: "Large SSE message queue for tenant {{ $labels.tenant_id }}" description: "SSE message queue for tenant {{ $labels.tenant_id }} has {{ $value }} messages, indicating potential client issues." runbook_url: "https://docs.bakery.local/runbooks/alert-system#sse-queues" # Database storage issues - alert: SlowDatabaseStorage expr: histogram_quantile(0.95, rate(alert_database_storage_duration_seconds_bucket[5m])) > 1 for: 5m labels: severity: warning annotations: summary: "Slow database storage for alerts" description: "95th percentile database storage time is {{ $value }}s, exceeding 1s threshold." runbook_url: "https://docs.bakery.local/runbooks/alert-system#database-storage" - name: alert_system_effectiveness rules: # False positive rate monitoring - alert: HighFalsePositiveRate expr: alert_false_positive_rate > 0.2 for: 30m labels: severity: warning service: "{{ $labels.service }}" alert_type: "{{ $labels.alert_type }}" annotations: summary: "High false positive rate for {{ $labels.alert_type }}" description: "False positive rate for {{ $labels.alert_type }} in {{ $labels.service }} is {{ $value | humanizePercentage }}." 
runbook_url: "https://docs.bakery.local/runbooks/alert-system#false-positives" # Low recommendation adoption - alert: LowRecommendationAdoption expr: rate(alert_recommendations_implemented_total[24h]) / rate(alert_items_published_total{item_type="recommendation"}[24h]) < 0.1 for: 1h labels: severity: info service: "{{ $labels.service }}" annotations: summary: "Low recommendation adoption rate" description: "Recommendation adoption rate for {{ $labels.service }} is {{ $value | humanizePercentage }} over the last 24 hours." runbook_url: "https://docs.bakery.local/runbooks/alert-system#recommendation-adoption" # Additional alerting rules for specific scenarios - name: alert_system_critical_scenarios rules: # Complete system failure - alert: AlertSystemDown expr: up{job=~"alert-processor|notification-service"} == 0 for: 1m labels: severity: critical service: "{{ $labels.job }}" annotations: summary: "Alert system service {{ $labels.job }} is down" description: "Critical alert system service {{ $labels.job }} has been down for more than 1 minute." runbook_url: "https://docs.bakery.local/runbooks/alert-system#service-down" # Data loss prevention - alert: AlertDataNotPersisted expr: rate(alert_items_processed_total[5m]) > 0 and rate(alert_database_storage_duration_seconds_count[5m]) == 0 for: 2m labels: severity: critical annotations: summary: "Alert data not being persisted to database" description: "Alerts are being processed but not stored in database, potential data loss." runbook_url: "https://docs.bakery.local/runbooks/alert-system#data-persistence" # Notification blackhole - alert: NotificationsNotDelivered expr: rate(alert_items_processed_total[5m]) > 0 and rate(alert_notifications_sent_total[5m]) == 0 for: 3m labels: severity: critical annotations: summary: "Notifications not being delivered" description: "Alerts are being processed but no notifications are being sent." runbook_url: "https://docs.bakery.local/runbooks/alert-system#notification-delivery"