# infrastructure/monitoring/prometheus/rules/alerts.yml
#
# Prometheus alerting rules.
#
# Two rule groups are defined:
#   * bakery_services - process/HTTP/database health alerts
#   * bakery_business - training, prediction and rate-limit alerts
#
# None of the expressions below aggregate, so each threshold is evaluated
# per time series and one alert instance is raised per matching series.
---
groups:
  - name: bakery_services
    rules:
      # Service availability: a target's `up` series has been 0 for 2 minutes.
      - alert: ServiceDown
        expr: up == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Service {{ $labels.job }} is down"
          description: "Service {{ $labels.job }} has been down for more than 2 minutes."

      # Error rate: the 5-minute per-second rate of requests whose `status`
      # label matches 5xx stays above 0.1 for 5 minutes.
      - alert: HighErrorRate
        expr: rate(http_requests_total{status=~"5.."}[5m]) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate on {{ $labels.job }}"
          description: "Error rate is {{ $value }} errors per second on {{ $labels.job }}."

      # Latency: the 95th percentile derived from the request-duration
      # histogram buckets (5-minute rate) stays above 1 for 5 minutes.
      - alert: HighResponseTime
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High response time on {{ $labels.job }}"
          description: "95th percentile response time is {{ $value }}s on {{ $labels.job }}."

      # Memory: resident memory, divided by 1024 twice (bytes -> MiB), stays
      # above 500 for 5 minutes.
      - alert: HighMemoryUsage
        expr: process_resident_memory_bytes / 1024 / 1024 > 500
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage on {{ $labels.job }}"
          description: "Memory usage is {{ $value }}MB on {{ $labels.job }}."

      # Database connections: pg_stat_activity_count stays above 80 for
      # 5 minutes.
      # NOTE(review): presumably exported by a PostgreSQL exporter - confirm
      # which labels split this series before relying on the raw value.
      - alert: DatabaseConnectionHigh
        expr: pg_stat_activity_count > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High database connections"
          description: "Database has {{ $value }} active connections."

  - name: bakery_business
    rules:
      # Training jobs: the failure counter increased within the last hour.
      # There is no `for:` clause, so no hold period is applied before firing.
      - alert: TrainingJobFailed
        expr: increase(training_jobs_failed_total[1h]) > 0
        labels:
          severity: warning
        annotations:
          summary: "Training job failed"
          description: "{{ $value }} training jobs have failed in the last hour."

      # Prediction accuracy: the gauge stays below 0.7 for 15 minutes.
      # NOTE(review): the description reads a `tenant_id` label - confirm the
      # metric actually carries it, otherwise the text renders empty.
      - alert: LowPredictionAccuracy
        expr: prediction_accuracy < 0.7
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "Low prediction accuracy"
          description: "Prediction accuracy is {{ $value }} for tenant {{ $labels.tenant_id }}."

      # API rate limiting: more than 10 rate-limit hits per 5-minute window,
      # sustained for 5 minutes.
      - alert: APIRateLimitHit
        expr: increase(rate_limit_hits_total[5m]) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "API rate limit hit frequently"
          description: "Rate limit has been hit {{ $value }} times in 5 minutes."