Improve monitoring for prod

Urtzi Alfaro
2026-01-07 19:12:35 +01:00
parent 560c7ba86f
commit 07178f8972
44 changed files with 6581 additions and 5111 deletions

@@ -1,644 +0,0 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "Comprehensive monitoring dashboard for the Bakery Alert and Recommendation System",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [],
"liveNow": false,
"panels": [
{
"datasource": "prometheus",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"vis": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
},
"id": 1,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"expr": "rate(alert_items_published_total[5m])",
"interval": "",
"legendFormat": "{{item_type}} - {{severity}}",
"refId": "A"
}
],
"title": "Alert/Recommendation Publishing Rate",
"type": "timeseries"
},
{
"datasource": "prometheus",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
},
"id": 2,
"options": {
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true,
"text": {}
},
"pluginVersion": "8.0.0",
"targets": [
{
"expr": "sum(alert_sse_active_connections)",
"interval": "",
"legendFormat": "Active SSE Connections",
"refId": "A"
}
],
"title": "Active SSE Connections",
"type": "gauge"
},
{
"datasource": "prometheus",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"hideFrom": {
"legend": false,
"tooltip": false,
"vis": false
}
},
"mappings": []
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 8,
"x": 0,
"y": 8
},
"id": 3,
"options": {
"legend": {
"displayMode": "list",
"placement": "right"
},
"pieType": "pie",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"expr": "sum by (item_type) (alert_items_published_total)",
"interval": "",
"legendFormat": "{{item_type}}",
"refId": "A"
}
],
"title": "Items by Type",
"type": "piechart"
},
{
"datasource": "prometheus",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"hideFrom": {
"legend": false,
"tooltip": false,
"vis": false
}
},
"mappings": []
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 8,
"x": 8,
"y": 8
},
"id": 4,
"options": {
"legend": {
"displayMode": "list",
"placement": "right"
},
"pieType": "pie",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"expr": "sum by (severity) (alert_items_published_total)",
"interval": "",
"legendFormat": "{{severity}}",
"refId": "A"
}
],
"title": "Items by Severity",
"type": "piechart"
},
{
"datasource": "prometheus",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"vis": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 8,
"x": 16,
"y": 8
},
"id": 5,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"expr": "rate(alert_notifications_sent_total[5m])",
"interval": "",
"legendFormat": "{{channel}}",
"refId": "A"
}
],
"title": "Notification Delivery Rate by Channel",
"type": "timeseries"
},
{
"datasource": "prometheus",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"vis": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 16
},
"id": 6,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"expr": "histogram_quantile(0.95, rate(alert_processing_duration_seconds_bucket[5m]))",
"interval": "",
"legendFormat": "95th percentile",
"refId": "A"
},
{
"expr": "histogram_quantile(0.50, rate(alert_processing_duration_seconds_bucket[5m]))",
"interval": "",
"legendFormat": "50th percentile (median)",
"refId": "B"
}
],
"title": "Processing Duration",
"type": "timeseries"
},
{
"datasource": "prometheus",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"vis": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 16
},
"id": 7,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"expr": "rate(alert_processing_errors_total[5m])",
"interval": "",
"legendFormat": "{{error_type}}",
"refId": "A"
},
{
"expr": "rate(alert_delivery_failures_total[5m])",
"interval": "",
"legendFormat": "Delivery: {{channel}}",
"refId": "B"
}
],
"title": "Error Rates",
"type": "timeseries"
},
{
"datasource": "prometheus",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"custom": {
"align": "auto",
"displayMode": "auto"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Health"
},
"properties": [
{
"id": "custom.displayMode",
"value": "color-background"
},
{
"id": "mappings",
"value": [
{
"options": {
"0": {
"color": "red",
"index": 0,
"text": "Unhealthy"
},
"1": {
"color": "green",
"index": 1,
"text": "Healthy"
}
},
"type": "value"
}
]
}
]
}
]
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 24
},
"id": 8,
"options": {
"showHeader": true
},
"pluginVersion": "8.0.0",
"targets": [
{
"expr": "alert_system_component_health",
"format": "table",
"interval": "",
"legendFormat": "",
"refId": "A"
}
],
"title": "System Component Health",
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": {
"__name__": true,
"instance": true,
"job": true
},
"indexByName": {},
"renameByName": {
"Value": "Health",
"component": "Component",
"service": "Service"
}
}
}
],
"type": "table"
}
],
"schemaVersion": 27,
"style": "dark",
"tags": [
"bakery",
"alerts",
"recommendations",
"monitoring"
],
"templating": {
"list": []
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {},
"timezone": "Europe/Madrid",
"title": "Bakery Alert & Recommendation System",
"uid": "bakery-alert-system",
"version": 1
}

@@ -1,15 +0,0 @@
# infrastructure/monitoring/grafana/dashboards/dashboard.yml
# Grafana dashboard provisioning
apiVersion: 1

providers:
  - name: 'bakery-dashboards'
    orgId: 1
    folder: 'Bakery Forecasting'
    type: file
    disableDeletion: false
    updateIntervalSeconds: 10
    allowUiUpdates: true
    options:
      path: /etc/grafana/provisioning/dashboards
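
A minimal sketch of how these provisioning files might be wired into the Grafana container, assuming a docker-compose deployment; the compose service name, image tag, and host paths are illustrative and not taken from this commit. Because the provider's options.path points at the same /etc/grafana/provisioning/dashboards directory that Grafana scans for provider files, one mount can cover both the provider YAML and the dashboard JSON above; the datasource files (next file) go under provisioning/datasources.

  grafana:
    image: grafana/grafana:8.0.0   # hypothetical tag, chosen to match the dashboard's pluginVersion
    volumes:
      # provider file (dashboard.yml) and dashboard JSON share one directory,
      # which is also the provider's options.path
      - ./infrastructure/monitoring/grafana/dashboards:/etc/grafana/provisioning/dashboards:ro
      # Prometheus + Jaeger datasource provisioning (defined in the next file)
      - ./infrastructure/monitoring/grafana/datasources:/etc/grafana/provisioning/datasources:ro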

@@ -1,28 +0,0 @@
# infrastructure/monitoring/grafana/datasources/prometheus.yml
# Grafana Prometheus datasource configuration
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
    version: 1
    editable: true
    jsonData:
      timeInterval: "15s"
      queryTimeout: "60s"
      httpMethod: "POST"
      exemplarTraceIdDestinations:
        - name: trace_id
          datasourceUid: jaeger

  - name: Jaeger
    type: jaeger
    access: proxy
    url: http://jaeger:16686
    uid: jaeger
    version: 1
    editable: true

@@ -1,42 +0,0 @@
# ================================================================
# Monitoring Configuration: infrastructure/monitoring/prometheus/forecasting-service.yml
# ================================================================
groups:
  - name: forecasting-service
    rules:
      - alert: ForecastingServiceDown
        expr: up{job="forecasting-service"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Forecasting service is down"
          description: "Forecasting service has been down for more than 1 minute"

      - alert: HighForecastingLatency
        # 95th percentile computed from the histogram buckets
        expr: histogram_quantile(0.95, rate(forecast_processing_time_seconds_bucket[5m])) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High forecasting latency"
          description: "95th percentile forecasting latency is {{ $value }}s"

      - alert: ForecastingErrorRate
        expr: rate(forecasting_errors_total[5m]) > 0.1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High forecasting error rate"
          description: "Forecasting error rate is {{ $value }} errors/sec"

      - alert: LowModelAccuracy
        expr: avg(model_accuracy_score) < 0.7
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Low model accuracy detected"
          description: "Average model accuracy is {{ $value }}"

@@ -1,88 +0,0 @@
# infrastructure/monitoring/prometheus/prometheus.yml
# Prometheus configuration
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    cluster: 'bakery-forecasting'
    replica: 'prometheus-01'

rule_files:
  - "/etc/prometheus/rules/*.yml"

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            # - alertmanager:9093

scrape_configs:
  # Service discovery for microservices
  - job_name: 'gateway'
    static_configs:
      - targets: ['gateway-service:8000']
    metrics_path: '/metrics'
    scrape_interval: 30s
    scrape_timeout: 10s

  - job_name: 'auth-service'
    static_configs:
      - targets: ['auth-service:8000']
    metrics_path: '/metrics'
    scrape_interval: 30s

  - job_name: 'tenant-service'
    static_configs:
      - targets: ['tenant-service:8000']
    metrics_path: '/metrics'
    scrape_interval: 30s

  - job_name: 'training-service'
    static_configs:
      - targets: ['training-service:8000']
    metrics_path: '/metrics'
    scrape_interval: 30s

  - job_name: 'forecasting-service'
    static_configs:
      - targets: ['forecasting-service:8000']
    metrics_path: '/metrics'
    scrape_interval: 30s

  - job_name: 'sales-service'
    static_configs:
      - targets: ['sales-service:8000']
    metrics_path: '/metrics'
    scrape_interval: 30s

  - job_name: 'external-service'
    static_configs:
      - targets: ['external-service:8000']
    metrics_path: '/metrics'
    scrape_interval: 30s

  - job_name: 'notification-service'
    static_configs:
      - targets: ['notification-service:8000']
    metrics_path: '/metrics'
    scrape_interval: 30s

  # Infrastructure monitoring
  - job_name: 'redis'
    static_configs:
      - targets: ['redis:6379']
    metrics_path: '/metrics'
    scrape_interval: 30s

  - job_name: 'rabbitmq'
    static_configs:
      - targets: ['rabbitmq:15692']
    metrics_path: '/metrics'
    scrape_interval: 30s

  # Database monitoring (requires postgres_exporter)
  - job_name: 'postgres'
    static_configs:
      - targets: ['postgres-exporter:9187']
    scrape_interval: 30s
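
The alerting block above leaves the Alertmanager target commented out, so the rule files referenced by rule_files are evaluated but their alerts are routed nowhere. If and when an Alertmanager is deployed, the block would look roughly like this, assuming the alertmanager:9093 address already hinted at in the comment:

  alerting:
    alertmanagers:
      - static_configs:
          - targets:
              - alertmanager:9093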

@@ -1,243 +0,0 @@
# infrastructure/monitoring/prometheus/rules/alert-system-rules.yml
# Prometheus alerting rules for the Bakery Alert and Recommendation System
groups:
  - name: alert_system_health
    rules:
      # System component health alerts
      - alert: AlertSystemComponentDown
        expr: alert_system_component_health == 0
        for: 2m
        labels:
          severity: critical
          service: "{{ $labels.service }}"
          component: "{{ $labels.component }}"
        annotations:
          summary: "Alert system component {{ $labels.component }} is unhealthy"
          description: "Component {{ $labels.component }} in service {{ $labels.service }} has been unhealthy for more than 2 minutes."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#component-health"

      # Connection health alerts
      - alert: RabbitMQConnectionDown
        expr: alert_rabbitmq_connection_status == 0
        for: 1m
        labels:
          severity: critical
          service: "{{ $labels.service }}"
        annotations:
          summary: "RabbitMQ connection down for {{ $labels.service }}"
          description: "Service {{ $labels.service }} has lost connection to RabbitMQ for more than 1 minute."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#rabbitmq-connection"

      - alert: RedisConnectionDown
        expr: alert_redis_connection_status == 0
        for: 1m
        labels:
          severity: critical
          service: "{{ $labels.service }}"
        annotations:
          summary: "Redis connection down for {{ $labels.service }}"
          description: "Service {{ $labels.service }} has lost connection to Redis for more than 1 minute."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#redis-connection"

      # Leader election issues
      - alert: NoSchedulerLeader
        expr: sum(alert_scheduler_leader_status) == 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "No scheduler leader elected"
          description: "No service has been elected as scheduler leader for more than 5 minutes. Scheduled checks may not be running."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#leader-election"

  - name: alert_system_performance
    rules:
      # High error rates
      - alert: HighAlertProcessingErrorRate
        expr: rate(alert_processing_errors_total[5m]) > 0.1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High alert processing error rate"
          description: "Alert processing error rate is {{ $value | humanizePercentage }} over the last 5 minutes."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#processing-errors"

      - alert: HighNotificationDeliveryFailureRate
        expr: rate(alert_delivery_failures_total[5m]) / rate(alert_notifications_sent_total[5m]) > 0.05
        for: 3m
        labels:
          severity: warning
          channel: "{{ $labels.channel }}"
        annotations:
          summary: "High notification delivery failure rate for {{ $labels.channel }}"
          description: "Notification delivery failure rate for {{ $labels.channel }} is {{ $value | humanizePercentage }} over the last 5 minutes."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#delivery-failures"

      # Processing latency
      - alert: HighAlertProcessingLatency
        expr: histogram_quantile(0.95, rate(alert_processing_duration_seconds_bucket[5m])) > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High alert processing latency"
          description: "95th percentile alert processing latency is {{ $value }}s, exceeding 5s threshold."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#processing-latency"

      # SSE connection issues
      - alert: TooManySSEConnections
        expr: sum(alert_sse_active_connections) > 1000
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Too many active SSE connections"
          description: "Number of active SSE connections ({{ $value }}) exceeds 1000. This may impact performance."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#sse-connections"

      - alert: SSEConnectionErrors
        expr: rate(alert_sse_connection_errors_total[5m]) > 0.5
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "High SSE connection error rate"
          description: "SSE connection error rate is {{ $value }} errors/second over the last 5 minutes."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#sse-errors"

  - name: alert_system_business
    rules:
      # Alert volume anomalies
      - alert: UnusuallyHighAlertVolume
        expr: rate(alert_items_published_total{item_type="alert"}[10m]) > 2
        for: 5m
        labels:
          severity: warning
          service: "{{ $labels.service }}"
        annotations:
          summary: "Unusually high alert volume from {{ $labels.service }}"
          description: "Service {{ $labels.service }} is generating alerts at {{ $value }} alerts/second, which is above normal levels."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#high-volume"

      - alert: NoAlertsGenerated
        expr: rate(alert_items_published_total[30m]) == 0
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "No alerts generated recently"
          description: "No alerts have been generated in the last 30 minutes. This may indicate a problem with detection systems."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#no-alerts"

      # Response time issues
      - alert: SlowAlertResponseTime
        expr: histogram_quantile(0.95, rate(alert_item_response_time_seconds_bucket[1h])) > 3600
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Slow alert response times"
          description: "95th percentile alert response time is {{ $value | humanizeDuration }}, exceeding 1 hour."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#response-times"

      # Critical alerts not acknowledged
      - alert: CriticalAlertsUnacknowledged
        expr: sum(alert_active_items_current{item_type="alert",severity="urgent"}) > 5
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "Multiple critical alerts unacknowledged"
          description: "{{ $value }} critical alerts remain unacknowledged for more than 10 minutes."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#critical-unacked"

  - name: alert_system_capacity
    rules:
      # Queue size monitoring
      - alert: LargeSSEMessageQueues
        expr: alert_sse_message_queue_size > 100
        for: 5m
        labels:
          severity: warning
          tenant_id: "{{ $labels.tenant_id }}"
        annotations:
          summary: "Large SSE message queue for tenant {{ $labels.tenant_id }}"
          description: "SSE message queue for tenant {{ $labels.tenant_id }} has {{ $value }} messages, indicating potential client issues."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#sse-queues"

      # Database storage issues
      - alert: SlowDatabaseStorage
        expr: histogram_quantile(0.95, rate(alert_database_storage_duration_seconds_bucket[5m])) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Slow database storage for alerts"
          description: "95th percentile database storage time is {{ $value }}s, exceeding 1s threshold."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#database-storage"

  - name: alert_system_effectiveness
    rules:
      # False positive rate monitoring
      - alert: HighFalsePositiveRate
        expr: alert_false_positive_rate > 0.2
        for: 30m
        labels:
          severity: warning
          service: "{{ $labels.service }}"
          alert_type: "{{ $labels.alert_type }}"
        annotations:
          summary: "High false positive rate for {{ $labels.alert_type }}"
          description: "False positive rate for {{ $labels.alert_type }} in {{ $labels.service }} is {{ $value | humanizePercentage }}."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#false-positives"

      # Low recommendation adoption
      - alert: LowRecommendationAdoption
        expr: rate(alert_recommendations_implemented_total[24h]) / rate(alert_items_published_total{item_type="recommendation"}[24h]) < 0.1
        for: 1h
        labels:
          severity: info
          service: "{{ $labels.service }}"
        annotations:
          summary: "Low recommendation adoption rate"
          description: "Recommendation adoption rate for {{ $labels.service }} is {{ $value | humanizePercentage }} over the last 24 hours."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#recommendation-adoption"

  # Additional alerting rules for specific scenarios
  - name: alert_system_critical_scenarios
    rules:
      # Complete system failure
      - alert: AlertSystemDown
        expr: up{job=~"alert-processor|notification-service"} == 0
        for: 1m
        labels:
          severity: critical
          service: "{{ $labels.job }}"
        annotations:
          summary: "Alert system service {{ $labels.job }} is down"
          description: "Critical alert system service {{ $labels.job }} has been down for more than 1 minute."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#service-down"

      # Data loss prevention
      - alert: AlertDataNotPersisted
        expr: rate(alert_items_processed_total[5m]) > 0 and rate(alert_database_storage_duration_seconds_count[5m]) == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Alert data not being persisted to database"
          description: "Alerts are being processed but not stored in database, potential data loss."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#data-persistence"

      # Notification blackhole
      - alert: NotificationsNotDelivered
        expr: rate(alert_items_processed_total[5m]) > 0 and rate(alert_notifications_sent_total[5m]) == 0
        for: 3m
        labels:
          severity: critical
        annotations:
          summary: "Notifications not being delivered"
          description: "Alerts are being processed but no notifications are being sent."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#notification-delivery"

@@ -1,86 +0,0 @@
# infrastructure/monitoring/prometheus/rules/alerts.yml
# Prometheus alerting rules
groups:
  - name: bakery_services
    rules:
      # Service availability alerts
      - alert: ServiceDown
        expr: up == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Service {{ $labels.job }} is down"
          description: "Service {{ $labels.job }} has been down for more than 2 minutes."

      # High error rate alerts
      - alert: HighErrorRate
        expr: rate(http_requests_total{status=~"5.."}[5m]) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate on {{ $labels.job }}"
          description: "Error rate is {{ $value }} errors per second on {{ $labels.job }}."

      # High response time alerts
      - alert: HighResponseTime
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High response time on {{ $labels.job }}"
          description: "95th percentile response time is {{ $value }}s on {{ $labels.job }}."

      # Memory usage alerts
      - alert: HighMemoryUsage
        expr: process_resident_memory_bytes / 1024 / 1024 > 500
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage on {{ $labels.job }}"
          description: "Memory usage is {{ $value }}MB on {{ $labels.job }}."

      # Database connection alerts
      - alert: DatabaseConnectionHigh
        expr: pg_stat_activity_count > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High database connections"
          description: "Database has {{ $value }} active connections."

  - name: bakery_business
    rules:
      # Training job alerts
      - alert: TrainingJobFailed
        expr: increase(training_jobs_failed_total[1h]) > 0
        labels:
          severity: warning
        annotations:
          summary: "Training job failed"
          description: "{{ $value }} training jobs have failed in the last hour."

      # Prediction accuracy alerts
      - alert: LowPredictionAccuracy
        expr: prediction_accuracy < 0.7
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "Low prediction accuracy"
          description: "Prediction accuracy is {{ $value }} for tenant {{ $labels.tenant_id }}."

      # API rate limit alerts
      - alert: APIRateLimitHit
        expr: increase(rate_limit_hits_total[5m]) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "API rate limit hit frequently"
          description: "Rate limit has been hit {{ $value }} times in 5 minutes."