---
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-alert-rules
  namespace: monitoring
data:
  alert-rules.yml: |
    groups:
      # Basic Infrastructure Alerts
      - name: bakery_services
        interval: 30s
        rules:
          - alert: ServiceDown
            expr: up{job="bakery-services"} == 0
            for: 2m
            labels:
              severity: critical
              component: infrastructure
            annotations:
              summary: "Service {{ $labels.service }} is down"
              description: "Service {{ $labels.service }} in namespace {{ $labels.namespace }} has been down for more than 2 minutes."
              runbook_url: "https://runbooks.bakery-ia.local/ServiceDown"

          - alert: HighErrorRate
            expr: |
              (
                sum(rate(http_requests_total{status_code=~"5..", job="bakery-services"}[5m])) by (service)
                /
                sum(rate(http_requests_total{job="bakery-services"}[5m])) by (service)
              ) > 0.10
            for: 5m
            labels:
              severity: critical
              component: application
            annotations:
              summary: "High error rate on {{ $labels.service }}"
              description: "Service {{ $labels.service }} has error rate above 10% (current: {{ $value | humanizePercentage }})."
              runbook_url: "https://runbooks.bakery-ia.local/HighErrorRate"

          - alert: HighResponseTime
            expr: |
              histogram_quantile(0.95,
                sum(rate(http_request_duration_seconds_bucket{job="bakery-services"}[5m])) by (service, le)
              ) > 1
            for: 5m
            labels:
              severity: warning
              component: performance
            annotations:
              summary: "High response time on {{ $labels.service }}"
              description: "Service {{ $labels.service }} P95 latency is above 1 second (current: {{ $value }}s)."
              runbook_url: "https://runbooks.bakery-ia.local/HighResponseTime"

          - alert: HighMemoryUsage
            expr: |
              container_memory_usage_bytes{namespace="bakery-ia", container!=""} > 500000000
            for: 5m
            labels:
              severity: warning
              component: infrastructure
            annotations:
              summary: "High memory usage in {{ $labels.pod }}"
              description: "Container {{ $labels.container }} in pod {{ $labels.pod }} is using more than 500MB of memory (current: {{ $value | humanize }}B)."
              runbook_url: "https://runbooks.bakery-ia.local/HighMemoryUsage"

          - alert: DatabaseConnectionHigh
            expr: |
              pg_stat_database_numbackends{datname="bakery"} > 80
            for: 5m
            labels:
              severity: warning
              component: database
            annotations:
              summary: "High database connection count"
              description: "Database has more than 80 active connections (current: {{ $value }})."
              runbook_url: "https://runbooks.bakery-ia.local/DatabaseConnectionHigh"

      # Business Logic Alerts
      - name: bakery_business
        interval: 30s
        rules:
          - alert: TrainingJobFailed
            expr: |
              increase(training_job_failures_total[1h]) > 0
            for: 5m
            labels:
              severity: warning
              component: ml-training
            annotations:
              summary: "Training job failures detected"
              description: "{{ $value }} training job(s) failed in the last hour."
              runbook_url: "https://runbooks.bakery-ia.local/TrainingJobFailed"

          - alert: LowPredictionAccuracy
            expr: |
              prediction_model_accuracy < 0.70
            for: 15m
            labels:
              severity: warning
              component: ml-inference
            annotations:
              summary: "Model prediction accuracy is low"
              description: "Model {{ $labels.model_name }} accuracy is below 70% (current: {{ $value | humanizePercentage }})."
              runbook_url: "https://runbooks.bakery-ia.local/LowPredictionAccuracy"

          - alert: APIRateLimitHit
            expr: |
              increase(rate_limit_hits_total[5m]) > 10
            for: 5m
            labels:
              severity: info
              component: api-gateway
            annotations:
              summary: "API rate limits being hit frequently"
              description: "Rate limits hit {{ $value }} times in the last 5 minutes."
              runbook_url: "https://runbooks.bakery-ia.local/APIRateLimitHit"

      # Alert System Health
      - name: alert_system_health
        interval: 30s
        rules:
          - alert: AlertSystemComponentDown
            expr: |
              alert_system_component_health{component=~"processor|notifier|scheduler"} == 0
            for: 2m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "Alert system component {{ $labels.component }} is unhealthy"
              description: "Component {{ $labels.component }} has been unhealthy for more than 2 minutes."
              runbook_url: "https://runbooks.bakery-ia.local/AlertSystemComponentDown"

          - alert: RabbitMQConnectionDown
            expr: |
              rabbitmq_up == 0
            for: 1m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "RabbitMQ connection is down"
              description: "Alert system has lost connection to RabbitMQ message queue."
              runbook_url: "https://runbooks.bakery-ia.local/RabbitMQConnectionDown"

          - alert: RedisConnectionDown
            expr: |
              redis_up == 0
            for: 1m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "Redis connection is down"
              description: "Alert system has lost connection to Redis cache."
              runbook_url: "https://runbooks.bakery-ia.local/RedisConnectionDown"

          - alert: NoSchedulerLeader
            expr: |
              sum(alert_system_scheduler_leader) == 0
            for: 5m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "No alert scheduler leader elected"
              description: "No scheduler instance has been elected as leader for 5 minutes."
              runbook_url: "https://runbooks.bakery-ia.local/NoSchedulerLeader"

      # Alert System Performance
      - name: alert_system_performance
        interval: 30s
        rules:
          - alert: HighAlertProcessingErrorRate
            expr: |
              (
                sum(rate(alert_processing_errors_total[2m]))
                /
                sum(rate(alerts_processed_total[2m]))
              ) > 0.10
            for: 2m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "High alert processing error rate"
              description: "Alert processing error rate is above 10% (current: {{ $value | humanizePercentage }})."
              runbook_url: "https://runbooks.bakery-ia.local/HighAlertProcessingErrorRate"

          - alert: HighNotificationDeliveryFailureRate
            expr: |
              (
                sum(rate(notification_delivery_failures_total[3m]))
                /
                sum(rate(notifications_sent_total[3m]))
              ) > 0.05
            for: 3m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "High notification delivery failure rate"
              description: "Notification delivery failure rate is above 5% (current: {{ $value | humanizePercentage }})."
              runbook_url: "https://runbooks.bakery-ia.local/HighNotificationDeliveryFailureRate"

          - alert: HighAlertProcessingLatency
            expr: |
              histogram_quantile(0.95,
                sum(rate(alert_processing_duration_seconds_bucket[5m])) by (le)
              ) > 5
            for: 5m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "High alert processing latency"
              description: "P95 alert processing latency is above 5 seconds (current: {{ $value }}s)."
              runbook_url: "https://runbooks.bakery-ia.local/HighAlertProcessingLatency"

          - alert: TooManySSEConnections
            expr: |
              sse_active_connections > 1000
            for: 2m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "Too many active SSE connections"
              description: "More than 1000 active SSE connections (current: {{ $value }})."
              runbook_url: "https://runbooks.bakery-ia.local/TooManySSEConnections"

          - alert: SSEConnectionErrors
            expr: |
              rate(sse_connection_errors_total[3m]) > 0.5
            for: 3m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "High rate of SSE connection errors"
              description: "SSE connection error rate is {{ $value }} errors/sec."
              runbook_url: "https://runbooks.bakery-ia.local/SSEConnectionErrors"

      # Alert System Business Logic
      - name: alert_system_business
        interval: 30s
        rules:
          - alert: UnusuallyHighAlertVolume
            expr: |
              rate(alerts_generated_total[5m]) > 2
            for: 5m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "Unusually high alert generation volume"
              description: "More than 2 alerts per second being generated (current: {{ $value }}/sec)."
              runbook_url: "https://runbooks.bakery-ia.local/UnusuallyHighAlertVolume"

          - alert: NoAlertsGenerated
            expr: |
              rate(alerts_generated_total[30m]) == 0
            for: 15m
            labels:
              severity: info
              component: alert-system
            annotations:
              summary: "No alerts generated recently"
              description: "No alerts have been generated in the last 30 minutes. This might indicate a problem with alert detection."
              runbook_url: "https://runbooks.bakery-ia.local/NoAlertsGenerated"

          - alert: SlowAlertResponseTime
            expr: |
              histogram_quantile(0.95,
                sum(rate(alert_response_time_seconds_bucket[10m])) by (le)
              ) > 3600
            for: 10m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "Slow alert response times"
              description: "P95 alert response time is above 1 hour (current: {{ $value | humanizeDuration }})."
              runbook_url: "https://runbooks.bakery-ia.local/SlowAlertResponseTime"

          - alert: CriticalAlertsUnacknowledged
            expr: |
              sum(alerts_unacknowledged{severity="critical"}) > 5
            for: 10m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "Multiple critical alerts unacknowledged"
              description: "{{ $value }} critical alerts have not been acknowledged for 10+ minutes."
              runbook_url: "https://runbooks.bakery-ia.local/CriticalAlertsUnacknowledged"

      # Alert System Capacity
      - name: alert_system_capacity
        interval: 30s
        rules:
          - alert: LargeSSEMessageQueues
            expr: |
              sse_message_queue_size > 100
            for: 5m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "Large SSE message queues detected"
              description: "SSE message queue for tenant {{ $labels.tenant_id }} has {{ $value }} messages queued."
              runbook_url: "https://runbooks.bakery-ia.local/LargeSSEMessageQueues"

          - alert: SlowDatabaseStorage
            expr: |
              histogram_quantile(0.95,
                sum(rate(alert_storage_duration_seconds_bucket[5m])) by (le)
              ) > 1
            for: 5m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "Slow alert database storage"
              description: "P95 alert storage latency is above 1 second (current: {{ $value }}s)."
              runbook_url: "https://runbooks.bakery-ia.local/SlowDatabaseStorage"

      # Alert System Critical Scenarios
      - name: alert_system_critical
        interval: 15s
        rules:
          - alert: AlertSystemDown
            expr: |
              up{service=~"alert-processor|notification-service"} == 0
            for: 1m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "Alert system is completely down"
              description: "Core alert system service {{ $labels.service }} is down."
              runbook_url: "https://runbooks.bakery-ia.local/AlertSystemDown"

          - alert: AlertDataNotPersisted
            expr: |
              (
                sum(rate(alerts_processed_total[2m]))
                -
                sum(rate(alerts_stored_total[2m]))
              ) > 0
            for: 2m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "Alerts not being persisted to database"
              description: "Alerts are being processed but not stored in the database."
              runbook_url: "https://runbooks.bakery-ia.local/AlertDataNotPersisted"

          - alert: NotificationsNotDelivered
            expr: |
              (
                sum(rate(alerts_processed_total[3m]))
                -
                sum(rate(notifications_sent_total[3m]))
              ) > 0
            for: 3m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "Notifications not being delivered"
              description: "Alerts are being processed but notifications are not being sent."
              runbook_url: "https://runbooks.bakery-ia.local/NotificationsNotDelivered"

      # Monitoring System Self-Monitoring
      - name: monitoring_health
        interval: 30s
        rules:
          - alert: PrometheusDown
            expr: up{job="prometheus"} == 0
            for: 5m
            labels:
              severity: critical
              component: monitoring
            annotations:
              summary: "Prometheus is down"
              description: "Prometheus monitoring system is not responding."
              runbook_url: "https://runbooks.bakery-ia.local/PrometheusDown"

          - alert: AlertManagerDown
            expr: up{job="alertmanager"} == 0
            for: 2m
            labels:
              severity: critical
              component: monitoring
            annotations:
              summary: "AlertManager is down"
              description: "AlertManager is not responding. Alerts will not be routed."
              runbook_url: "https://runbooks.bakery-ia.local/AlertManagerDown"

          - alert: PrometheusStorageFull
            expr: |
              (
                prometheus_tsdb_storage_blocks_bytes
                /
                (prometheus_tsdb_storage_blocks_bytes + prometheus_tsdb_wal_size_bytes)
              ) > 0.90
            for: 10m
            labels:
              severity: warning
              component: monitoring
            annotations:
              summary: "Prometheus storage almost full"
              description: "Prometheus storage is {{ $value | humanizePercentage }} full."
              runbook_url: "https://runbooks.bakery-ia.local/PrometheusStorageFull"

          - alert: PrometheusScrapeErrors
            expr: |
              rate(prometheus_target_scrapes_exceeded_sample_limit_total[5m]) > 0
            for: 5m
            labels:
              severity: warning
              component: monitoring
            annotations:
              summary: "Prometheus scrape errors detected"
              description: "Prometheus is experiencing scrape errors for target {{ $labels.job }}."
              runbook_url: "https://runbooks.bakery-ia.local/PrometheusScrapeErrors"