# infrastructure/monitoring/prometheus/rules/alerts.yml
# Prometheus alerting rules
groups:
# Technical health alerts: target availability, HTTP error rate and latency,
# process memory, and database connection count.
- name: bakery_services
  rules:
  # Service availability alerts
  # `up` is evaluated per scrape target, so the description names the
  # instance to keep alerts for different targets of one job distinguishable.
  - alert: ServiceDown
    expr: up == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Service {{ $labels.job }} is down"
      description: >-
        Instance {{ $labels.instance }} of service {{ $labels.job }}
        has been down for more than 2 minutes.

  # High error rate alerts
  # The 5xx rate is summed per job so that the threshold and the reported
  # value describe the whole service (as the annotations state) instead of
  # one individual series per status code / handler / instance.
  - alert: HighErrorRate
    expr: sum by (job) (rate(http_requests_total{status=~"5.."}[5m])) > 0.1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High error rate on {{ $labels.job }}"
      description: >-
        Error rate is {{ printf "%.2f" $value }} errors per second
        on {{ $labels.job }}.

  # High response time alerts
  # Bucket rates are aggregated by (job, le) before histogram_quantile so the
  # p95 is computed over all of a job's traffic; `le` must be kept for the
  # quantile calculation.
  - alert: HighResponseTime
    expr: histogram_quantile(0.95, sum by (job, le) (rate(http_request_duration_seconds_bucket[5m]))) > 1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High response time on {{ $labels.job }}"
      description: >-
        95th percentile response time is {{ printf "%.2f" $value }}s
        on {{ $labels.job }}.

  # Memory usage alerts
  # Resident memory in bytes is divided by 1024 twice, so the threshold of
  # 500 and the reported value are in units of 1024 * 1024 bytes.
  - alert: HighMemoryUsage
    expr: process_resident_memory_bytes / 1024 / 1024 > 500
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High memory usage on {{ $labels.job }}"
      description: >-
        Memory usage is {{ printf "%.0f" $value }}MB on {{ $labels.job }}.

  # Database connection alerts
  # NOTE(review): the threshold is applied to each pg_stat_activity_count
  # series separately. If the exporter splits this metric by database and
  # connection state, the value is not the total of active connections;
  # confirm the metric's labels before relying on the wording below.
  - alert: DatabaseConnectionHigh
    expr: pg_stat_activity_count > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High database connections"
      description: "Database has {{ $value }} active connections."
# Business-level alerts: training-job failures, prediction accuracy, and
# API rate-limit hits.
- name: bakery_business
  rules:
  # Training job alerts
  # No `for` clause: the alert fires as soon as the failure counter shows any
  # increase over the trailing hour.
  # NOTE(review): increase() extrapolates, so $value may render as a
  # fractional number in the description.
  - alert: TrainingJobFailed
    expr: increase(training_jobs_failed_total[1h]) > 0
    labels:
      severity: warning
    annotations:
      summary: 'Training job failed'
      description: '{{ $value }} training jobs have failed in the last hour.'

  # Prediction accuracy alerts
  # Fires when prediction_accuracy stays below 0.7 for 15 minutes.
  # NOTE(review): the description reads a tenant_id label; presumably the
  # metric carries one per tenant - confirm against the exporter.
  - alert: LowPredictionAccuracy
    expr: prediction_accuracy < 0.7
    for: 15m
    labels:
      severity: warning
    annotations:
      summary: 'Low prediction accuracy'
      description: 'Prediction accuracy is {{ $value }} for tenant {{ $labels.tenant_id }}.'

  # API rate limit alerts
  # Fires when more than 10 rate-limit hits per 5-minute window are sustained
  # for 5 minutes.
  - alert: APIRateLimitHit
    expr: increase(rate_limit_hits_total[5m]) > 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'API rate limit hit frequently'
      description: 'Rate limit has been hit {{ $value }} times in 5 minutes.'