---
# ConfigMap consumed by Prometheus as a rule_files entry.
# Contains alerting rules for the bakery platform services, the internal
# alert system (processor / notifier / scheduler), and the monitoring
# stack's own self-monitoring.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-alert-rules
  namespace: monitoring
data:
  alert-rules.yml: |
    groups:
      # Basic Infrastructure Alerts
      - name: bakery_services
        interval: 30s
        rules:
          - alert: ServiceDown
            expr: up{job="bakery-services"} == 0
            for: 2m
            labels:
              severity: critical
              component: infrastructure
            annotations:
              summary: "Service {{ $labels.service }} is down"
              description: "Service {{ $labels.service }} in namespace {{ $labels.namespace }} has been down for more than 2 minutes."
              runbook_url: "https://runbooks.bakery-ia.local/ServiceDown"

          - alert: HighErrorRate
            # Ratio of 5xx responses to total requests, per service.
            expr: |
              (
                sum(rate(http_requests_total{status_code=~"5..", job="bakery-services"}[5m])) by (service)
                /
                sum(rate(http_requests_total{job="bakery-services"}[5m])) by (service)
              ) > 0.10
            for: 5m
            labels:
              severity: critical
              component: application
            annotations:
              summary: "High error rate on {{ $labels.service }}"
              description: "Service {{ $labels.service }} has error rate above 10% (current: {{ $value | humanizePercentage }})."
              runbook_url: "https://runbooks.bakery-ia.local/HighErrorRate"

          - alert: HighResponseTime
            # P95 request latency per service, from histogram buckets.
            expr: |
              histogram_quantile(0.95,
                sum(rate(http_request_duration_seconds_bucket{job="bakery-services"}[5m])) by (service, le)
              ) > 1
            for: 5m
            labels:
              severity: warning
              component: performance
            annotations:
              summary: "High response time on {{ $labels.service }}"
              description: "Service {{ $labels.service }} P95 latency is above 1 second (current: {{ $value }}s)."
              runbook_url: "https://runbooks.bakery-ia.local/HighResponseTime"

          - alert: HighMemoryUsage
            # 500000000 bytes = 500 MB threshold per container.
            expr: |
              container_memory_usage_bytes{namespace="bakery-ia", container!=""} > 500000000
            for: 5m
            labels:
              severity: warning
              component: infrastructure
            annotations:
              summary: "High memory usage in {{ $labels.pod }}"
              description: "Container {{ $labels.container }} in pod {{ $labels.pod }} is using more than 500MB of memory (current: {{ $value | humanize }}B)."
              runbook_url: "https://runbooks.bakery-ia.local/HighMemoryUsage"

          - alert: DatabaseConnectionHigh
            expr: |
              pg_stat_database_numbackends{datname="bakery"} > 80
            for: 5m
            labels:
              severity: warning
              component: database
            annotations:
              summary: "High database connection count"
              description: "Database has more than 80 active connections (current: {{ $value }})."
              runbook_url: "https://runbooks.bakery-ia.local/DatabaseConnectionHigh"

      # Business Logic Alerts
      - name: bakery_business
        interval: 30s
        rules:
          - alert: TrainingJobFailed
            expr: |
              increase(training_job_failures_total[1h]) > 0
            for: 5m
            labels:
              severity: warning
              component: ml-training
            annotations:
              summary: "Training job failures detected"
              description: "{{ $value }} training job(s) failed in the last hour."
              runbook_url: "https://runbooks.bakery-ia.local/TrainingJobFailed"

          - alert: LowPredictionAccuracy
            expr: |
              prediction_model_accuracy < 0.70
            for: 15m
            labels:
              severity: warning
              component: ml-inference
            annotations:
              summary: "Model prediction accuracy is low"
              description: "Model {{ $labels.model_name }} accuracy is below 70% (current: {{ $value | humanizePercentage }})."
              runbook_url: "https://runbooks.bakery-ia.local/LowPredictionAccuracy"

          - alert: APIRateLimitHit
            expr: |
              increase(rate_limit_hits_total[5m]) > 10
            for: 5m
            labels:
              severity: info
              component: api-gateway
            annotations:
              summary: "API rate limits being hit frequently"
              description: "Rate limits hit {{ $value }} times in the last 5 minutes."
              runbook_url: "https://runbooks.bakery-ia.local/APIRateLimitHit"

      # Alert System Health
      - name: alert_system_health
        interval: 30s
        rules:
          - alert: AlertSystemComponentDown
            expr: |
              alert_system_component_health{component=~"processor|notifier|scheduler"} == 0
            for: 2m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "Alert system component {{ $labels.component }} is unhealthy"
              description: "Component {{ $labels.component }} has been unhealthy for more than 2 minutes."
              runbook_url: "https://runbooks.bakery-ia.local/AlertSystemComponentDown"

          - alert: RabbitMQConnectionDown
            expr: |
              rabbitmq_up == 0
            for: 1m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "RabbitMQ connection is down"
              description: "Alert system has lost connection to RabbitMQ message queue."
              runbook_url: "https://runbooks.bakery-ia.local/RabbitMQConnectionDown"

          - alert: RedisConnectionDown
            expr: |
              redis_up == 0
            for: 1m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "Redis connection is down"
              description: "Alert system has lost connection to Redis cache."
              runbook_url: "https://runbooks.bakery-ia.local/RedisConnectionDown"

          - alert: NoSchedulerLeader
            # Sum across scheduler replicas; zero means no elected leader.
            expr: |
              sum(alert_system_scheduler_leader) == 0
            for: 5m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "No alert scheduler leader elected"
              description: "No scheduler instance has been elected as leader for 5 minutes."
              runbook_url: "https://runbooks.bakery-ia.local/NoSchedulerLeader"

      # Alert System Performance
      - name: alert_system_performance
        interval: 30s
        rules:
          - alert: HighAlertProcessingErrorRate
            expr: |
              (
                sum(rate(alert_processing_errors_total[2m]))
                /
                sum(rate(alerts_processed_total[2m]))
              ) > 0.10
            for: 2m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "High alert processing error rate"
              description: "Alert processing error rate is above 10% (current: {{ $value | humanizePercentage }})."
              runbook_url: "https://runbooks.bakery-ia.local/HighAlertProcessingErrorRate"

          - alert: HighNotificationDeliveryFailureRate
            expr: |
              (
                sum(rate(notification_delivery_failures_total[3m]))
                /
                sum(rate(notifications_sent_total[3m]))
              ) > 0.05
            for: 3m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "High notification delivery failure rate"
              description: "Notification delivery failure rate is above 5% (current: {{ $value | humanizePercentage }})."
              runbook_url: "https://runbooks.bakery-ia.local/HighNotificationDeliveryFailureRate"

          - alert: HighAlertProcessingLatency
            expr: |
              histogram_quantile(0.95,
                sum(rate(alert_processing_duration_seconds_bucket[5m])) by (le)
              ) > 5
            for: 5m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "High alert processing latency"
              description: "P95 alert processing latency is above 5 seconds (current: {{ $value }}s)."
              runbook_url: "https://runbooks.bakery-ia.local/HighAlertProcessingLatency"

          - alert: TooManySSEConnections
            expr: |
              sse_active_connections > 1000
            for: 2m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "Too many active SSE connections"
              description: "More than 1000 active SSE connections (current: {{ $value }})."
              runbook_url: "https://runbooks.bakery-ia.local/TooManySSEConnections"

          - alert: SSEConnectionErrors
            expr: |
              rate(sse_connection_errors_total[3m]) > 0.5
            for: 3m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "High rate of SSE connection errors"
              description: "SSE connection error rate is {{ $value }} errors/sec."
              runbook_url: "https://runbooks.bakery-ia.local/SSEConnectionErrors"

      # Alert System Business Logic
      - name: alert_system_business
        interval: 30s
        rules:
          - alert: UnusuallyHighAlertVolume
            expr: |
              rate(alerts_generated_total[5m]) > 2
            for: 5m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "Unusually high alert generation volume"
              description: "More than 2 alerts per second being generated (current: {{ $value }}/sec)."
              runbook_url: "https://runbooks.bakery-ia.local/UnusuallyHighAlertVolume"

          - alert: NoAlertsGenerated
            # Silence can indicate broken detection rather than a healthy system.
            expr: |
              rate(alerts_generated_total[30m]) == 0
            for: 15m
            labels:
              severity: info
              component: alert-system
            annotations:
              summary: "No alerts generated recently"
              description: "No alerts have been generated in the last 30 minutes. This might indicate a problem with alert detection."
              runbook_url: "https://runbooks.bakery-ia.local/NoAlertsGenerated"

          - alert: SlowAlertResponseTime
            # 3600 seconds = 1 hour P95 threshold.
            expr: |
              histogram_quantile(0.95,
                sum(rate(alert_response_time_seconds_bucket[10m])) by (le)
              ) > 3600
            for: 10m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "Slow alert response times"
              description: "P95 alert response time is above 1 hour (current: {{ $value | humanizeDuration }})."
              runbook_url: "https://runbooks.bakery-ia.local/SlowAlertResponseTime"

          - alert: CriticalAlertsUnacknowledged
            expr: |
              sum(alerts_unacknowledged{severity="critical"}) > 5
            for: 10m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "Multiple critical alerts unacknowledged"
              description: "{{ $value }} critical alerts have not been acknowledged for 10+ minutes."
              runbook_url: "https://runbooks.bakery-ia.local/CriticalAlertsUnacknowledged"

      # Alert System Capacity
      - name: alert_system_capacity
        interval: 30s
        rules:
          - alert: LargeSSEMessageQueues
            expr: |
              sse_message_queue_size > 100
            for: 5m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "Large SSE message queues detected"
              description: "SSE message queue for tenant {{ $labels.tenant_id }} has {{ $value }} messages queued."
              runbook_url: "https://runbooks.bakery-ia.local/LargeSSEMessageQueues"

          - alert: SlowDatabaseStorage
            expr: |
              histogram_quantile(0.95,
                sum(rate(alert_storage_duration_seconds_bucket[5m])) by (le)
              ) > 1
            for: 5m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "Slow alert database storage"
              description: "P95 alert storage latency is above 1 second (current: {{ $value }}s)."
              runbook_url: "https://runbooks.bakery-ia.local/SlowDatabaseStorage"

      # Alert System Critical Scenarios
      - name: alert_system_critical
        # Tighter evaluation interval for the highest-impact failure modes.
        interval: 15s
        rules:
          - alert: AlertSystemDown
            expr: |
              up{service=~"alert-processor|notification-service"} == 0
            for: 1m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "Alert system is completely down"
              description: "Core alert system service {{ $labels.service }} is down."
              runbook_url: "https://runbooks.bakery-ia.local/AlertSystemDown"

          - alert: AlertDataNotPersisted
            # Positive gap between processed and stored rates means writes are lost.
            expr: |
              (
                sum(rate(alerts_processed_total[2m]))
                -
                sum(rate(alerts_stored_total[2m]))
              ) > 0
            for: 2m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "Alerts not being persisted to database"
              description: "Alerts are being processed but not stored in the database."
              runbook_url: "https://runbooks.bakery-ia.local/AlertDataNotPersisted"

          - alert: NotificationsNotDelivered
            expr: |
              (
                sum(rate(alerts_processed_total[3m]))
                -
                sum(rate(notifications_sent_total[3m]))
              ) > 0
            for: 3m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "Notifications not being delivered"
              description: "Alerts are being processed but notifications are not being sent."
              runbook_url: "https://runbooks.bakery-ia.local/NotificationsNotDelivered"

      # Monitoring System Self-Monitoring
      - name: monitoring_health
        interval: 30s
        rules:
          - alert: PrometheusDown
            expr: up{job="prometheus"} == 0
            for: 5m
            labels:
              severity: critical
              component: monitoring
            annotations:
              summary: "Prometheus is down"
              description: "Prometheus monitoring system is not responding."
              runbook_url: "https://runbooks.bakery-ia.local/PrometheusDown"

          - alert: AlertManagerDown
            expr: up{job="alertmanager"} == 0
            for: 2m
            labels:
              severity: critical
              component: monitoring
            annotations:
              summary: "AlertManager is down"
              description: "AlertManager is not responding. Alerts will not be routed."
              runbook_url: "https://runbooks.bakery-ia.local/AlertManagerDown"

          - alert: PrometheusStorageFull
            expr: |
              (
                prometheus_tsdb_storage_blocks_bytes
                /
                (prometheus_tsdb_storage_blocks_bytes + prometheus_tsdb_wal_size_bytes)
              ) > 0.90
            for: 10m
            labels:
              severity: warning
              component: monitoring
            annotations:
              summary: "Prometheus storage almost full"
              description: "Prometheus storage is {{ $value | humanizePercentage }} full."
              runbook_url: "https://runbooks.bakery-ia.local/PrometheusStorageFull"

          - alert: PrometheusScrapeErrors
            expr: |
              rate(prometheus_target_scrapes_exceeded_sample_limit_total[5m]) > 0
            for: 5m
            labels:
              severity: warning
              component: monitoring
            annotations:
              summary: "Prometheus scrape errors detected"
              description: "Prometheus is experiencing scrape errors for target {{ $labels.job }}."
              runbook_url: "https://runbooks.bakery-ia.local/PrometheusScrapeErrors"