Improve monitoring for prod
This commit is contained in:
@@ -1,644 +0,0 @@
|
||||
{
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"builtIn": 1,
|
||||
"datasource": "-- Grafana --",
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"type": "dashboard"
|
||||
}
|
||||
]
|
||||
},
|
||||
"description": "Comprehensive monitoring dashboard for the Bakery Alert and Recommendation System",
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 0,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"liveNow": false,
|
||||
"panels": [
|
||||
{
|
||||
"datasource": "prometheus",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"vis": false
|
||||
},
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"id": 1,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "list",
|
||||
"placement": "bottom"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(alert_items_published_total[5m])",
|
||||
"interval": "",
|
||||
"legendFormat": "{{item_type}} - {{severity}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Alert/Recommendation Publishing Rate",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": "prometheus",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 0
|
||||
},
|
||||
"id": 2,
|
||||
"options": {
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"showThresholdLabels": false,
|
||||
"showThresholdMarkers": true,
|
||||
"text": {}
|
||||
},
|
||||
"pluginVersion": "8.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(alert_sse_active_connections)",
|
||||
"interval": "",
|
||||
"legendFormat": "Active SSE Connections",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Active SSE Connections",
|
||||
"type": "gauge"
|
||||
},
|
||||
{
|
||||
"datasource": "prometheus",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"vis": false
|
||||
}
|
||||
},
|
||||
"mappings": []
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 8,
|
||||
"x": 0,
|
||||
"y": 8
|
||||
},
|
||||
"id": 3,
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "right"
|
||||
},
|
||||
"pieType": "pie",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
  "expr": "sum by (item_type) (increase(alert_items_published_total[$__range]))",
  "interval": "",
  "legendFormat": "{{item_type}}",
  "refId": "A"
}
|
||||
],
|
||||
"title": "Items by Type",
|
||||
"type": "piechart"
|
||||
},
|
||||
{
|
||||
"datasource": "prometheus",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"vis": false
|
||||
}
|
||||
},
|
||||
"mappings": []
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 8,
|
||||
"x": 8,
|
||||
"y": 8
|
||||
},
|
||||
"id": 4,
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "right"
|
||||
},
|
||||
"pieType": "pie",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
  "expr": "sum by (severity) (increase(alert_items_published_total[$__range]))",
  "interval": "",
  "legendFormat": "{{severity}}",
  "refId": "A"
}
|
||||
],
|
||||
"title": "Items by Severity",
|
||||
"type": "piechart"
|
||||
},
|
||||
{
|
||||
"datasource": "prometheus",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"vis": false
|
||||
},
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 8,
|
||||
"x": 16,
|
||||
"y": 8
|
||||
},
|
||||
"id": 5,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "list",
|
||||
"placement": "bottom"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(alert_notifications_sent_total[5m])",
|
||||
"interval": "",
|
||||
"legendFormat": "{{channel}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Notification Delivery Rate by Channel",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": "prometheus",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"vis": false
|
||||
},
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "s"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 16
|
||||
},
|
||||
"id": 6,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "list",
|
||||
"placement": "bottom"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
  "expr": "histogram_quantile(0.95, sum by (le) (rate(alert_processing_duration_seconds_bucket[5m])))",
  "interval": "",
  "legendFormat": "95th percentile",
  "refId": "A"
},
{
  "expr": "histogram_quantile(0.50, sum by (le) (rate(alert_processing_duration_seconds_bucket[5m])))",
  "interval": "",
  "legendFormat": "50th percentile (median)",
  "refId": "B"
}
|
||||
],
|
||||
"title": "Processing Duration",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": "prometheus",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"vis": false
|
||||
},
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 16
|
||||
},
|
||||
"id": 7,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "list",
|
||||
"placement": "bottom"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(alert_processing_errors_total[5m])",
|
||||
"interval": "",
|
||||
"legendFormat": "{{error_type}}",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "rate(alert_delivery_failures_total[5m])",
|
||||
"interval": "",
|
||||
"legendFormat": "Delivery: {{channel}}",
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"title": "Error Rates",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": "prometheus",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"custom": {
|
||||
"align": "auto",
|
||||
"displayMode": "auto"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "Health"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "custom.displayMode",
|
||||
"value": "color-background"
|
||||
},
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [
|
||||
{
|
||||
"options": {
|
||||
"0": {
|
||||
"color": "red",
|
||||
"index": 0,
|
||||
"text": "Unhealthy"
|
||||
},
|
||||
"1": {
|
||||
"color": "green",
|
||||
"index": 1,
|
||||
"text": "Healthy"
|
||||
}
|
||||
},
|
||||
"type": "value"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 24
|
||||
},
|
||||
"id": 8,
|
||||
"options": {
|
||||
"showHeader": true
|
||||
},
|
||||
"pluginVersion": "8.0.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "alert_system_component_health",
|
||||
"format": "table",
|
||||
"interval": "",
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "System Component Health",
|
||||
"transformations": [
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"__name__": true,
|
||||
"instance": true,
|
||||
"job": true
|
||||
},
|
||||
"indexByName": {},
|
||||
"renameByName": {
|
||||
"Value": "Health",
|
||||
"component": "Component",
|
||||
"service": "Service"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"type": "table"
|
||||
}
|
||||
],
|
||||
"schemaVersion": 27,
|
||||
"style": "dark",
|
||||
"tags": [
|
||||
"bakery",
|
||||
"alerts",
|
||||
"recommendations",
|
||||
"monitoring"
|
||||
],
|
||||
"templating": {
|
||||
"list": []
|
||||
},
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {},
|
||||
"timezone": "Europe/Madrid",
|
||||
"title": "Bakery Alert & Recommendation System",
|
||||
"uid": "bakery-alert-system",
|
||||
"version": 1
|
||||
}
|
||||
@@ -1,15 +0,0 @@
|
||||
# infrastructure/monitoring/grafana/dashboards/dashboard.yml
|
||||
# Grafana dashboard provisioning
|
||||
|
||||
apiVersion: 1
|
||||
|
||||
providers:
|
||||
- name: 'bakery-dashboards'
|
||||
orgId: 1
|
||||
folder: 'Bakery Forecasting'
|
||||
type: file
|
||||
disableDeletion: false
|
||||
updateIntervalSeconds: 10
|
||||
allowUiUpdates: true
|
||||
options:
|
||||
path: /etc/grafana/provisioning/dashboards
|
||||
@@ -1,28 +0,0 @@
|
||||
# infrastructure/monitoring/grafana/datasources/prometheus.yml
|
||||
# Grafana Prometheus datasource configuration
|
||||
|
||||
apiVersion: 1
|
||||
|
||||
datasources:
|
||||
- name: Prometheus
  type: prometheus
  access: proxy
  url: http://prometheus:9090
  # Pin a stable uid, as the Jaeger datasource in this file already does.
  # Dashboards that reference this datasource by the lowercase string
  # "prometheus" do not match the display name "Prometheus"; a matching uid
  # lets them resolve.
  # NOTE(review): confirm the lookup-by-uid behaviour on the Grafana version in use.
  uid: prometheus
  isDefault: true
  version: 1
  editable: true
  jsonData:
    # Minimum interval Grafana uses for queries; matches the 15s global scrape interval.
    timeInterval: "15s"
    queryTimeout: "60s"
    httpMethod: "POST"
    # Link exemplars carrying a trace_id label to the Jaeger datasource.
    exemplarTraceIdDestinations:
      - name: trace_id
        datasourceUid: jaeger
|
||||
|
||||
- name: Jaeger
|
||||
type: jaeger
|
||||
access: proxy
|
||||
url: http://jaeger:16686
|
||||
uid: jaeger
|
||||
version: 1
|
||||
editable: true
|
||||
@@ -1,42 +0,0 @@
|
||||
# ================================================================
|
||||
# Monitoring Configuration: infrastructure/monitoring/prometheus/forecasting-service.yml
|
||||
# ================================================================
|
||||
groups:
|
||||
- name: forecasting-service
|
||||
rules:
|
||||
- alert: ForecastingServiceDown
|
||||
expr: up{job="forecasting-service"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Forecasting service is down"
|
||||
description: "Forecasting service has been down for more than 1 minute"
|
||||
|
||||
- alert: HighForecastingLatency
  # Fires when the 5-minute p95 forecast processing time stays above 10s.
  # histogram_quantile() must be fed the per-`le` rates of the histogram's
  # *_bucket series; applied to the bare metric name it has no buckets to
  # interpolate and the alert can never evaluate meaningfully.
  # NOTE(review): assumes forecast_processing_time_seconds is exported as a
  # Prometheus histogram (i.e. a *_bucket series exists) — confirm.
  expr: histogram_quantile(0.95, sum by (le) (rate(forecast_processing_time_seconds_bucket[5m]))) > 10
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "High forecasting latency"
    description: "95th percentile forecasting latency is {{ $value }}s"
|
||||
|
||||
- alert: ForecastingErrorRate
|
||||
expr: rate(forecasting_errors_total[5m]) > 0.1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "High forecasting error rate"
|
||||
description: "Forecasting error rate is {{ $value }} errors/sec"
|
||||
|
||||
- alert: LowModelAccuracy
|
||||
expr: avg(model_accuracy_score) < 0.7
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Low model accuracy detected"
|
||||
description: "Average model accuracy is {{ $value }}"
|
||||
|
||||
@@ -1,88 +0,0 @@
|
||||
# infrastructure/monitoring/prometheus/prometheus.yml
|
||||
# Prometheus configuration
|
||||
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
evaluation_interval: 15s
|
||||
external_labels:
|
||||
cluster: 'bakery-forecasting'
|
||||
replica: 'prometheus-01'
|
||||
|
||||
rule_files:
|
||||
- "/etc/prometheus/rules/*.yml"
|
||||
|
||||
alerting:
|
||||
alertmanagers:
|
||||
- static_configs:
|
||||
- targets:
|
||||
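# NOTE(review): no Alertmanager target is active, so firing alerts are only
# visible in Prometheus itself and are not routed to any receiver.
# Uncomment the target below once an Alertmanager instance is deployed.
# - alertmanager:9093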
|
||||
|
||||
scrape_configs:
|
||||
# Service discovery for microservices
|
||||
- job_name: 'gateway'
|
||||
static_configs:
|
||||
- targets: ['gateway-service:8000']
|
||||
metrics_path: '/metrics'
|
||||
scrape_interval: 30s
|
||||
scrape_timeout: 10s
|
||||
|
||||
- job_name: 'auth-service'
|
||||
static_configs:
|
||||
- targets: ['auth-service:8000']
|
||||
metrics_path: '/metrics'
|
||||
scrape_interval: 30s
|
||||
|
||||
- job_name: 'tenant-service'
|
||||
static_configs:
|
||||
- targets: ['tenant-service:8000']
|
||||
metrics_path: '/metrics'
|
||||
scrape_interval: 30s
|
||||
|
||||
- job_name: 'training-service'
|
||||
static_configs:
|
||||
- targets: ['training-service:8000']
|
||||
metrics_path: '/metrics'
|
||||
scrape_interval: 30s
|
||||
|
||||
- job_name: 'forecasting-service'
|
||||
static_configs:
|
||||
- targets: ['forecasting-service:8000']
|
||||
metrics_path: '/metrics'
|
||||
scrape_interval: 30s
|
||||
|
||||
- job_name: 'sales-service'
|
||||
static_configs:
|
||||
- targets: ['sales-service:8000']
|
||||
metrics_path: '/metrics'
|
||||
scrape_interval: 30s
|
||||
|
||||
- job_name: 'external-service'
|
||||
static_configs:
|
||||
- targets: ['external-service:8000']
|
||||
metrics_path: '/metrics'
|
||||
scrape_interval: 30s
|
||||
|
||||
- job_name: 'notification-service'
|
||||
static_configs:
|
||||
- targets: ['notification-service:8000']
|
||||
metrics_path: '/metrics'
|
||||
scrape_interval: 30s
|
||||
|
||||
# Infrastructure monitoring
|
||||
- job_name: 'redis'
  # Redis does not serve Prometheus metrics over HTTP on its client port
  # (6379); scraping it directly leaves the target permanently down, which in
  # turn keeps any generic `up == 0` alert firing. Scrape a redis_exporter
  # sidecar instead, the same way the postgres job relies on postgres_exporter.
  # NOTE(review): assumes the exporter is reachable as redis-exporter:9121
  # (the exporter's default port) — confirm the service name.
  static_configs:
    - targets: ['redis-exporter:9121']
  metrics_path: '/metrics'
  scrape_interval: 30s
|
||||
|
||||
- job_name: 'rabbitmq'
|
||||
static_configs:
|
||||
- targets: ['rabbitmq:15692']
|
||||
metrics_path: '/metrics'
|
||||
scrape_interval: 30s
|
||||
|
||||
# Database monitoring (requires postgres_exporter)
|
||||
- job_name: 'postgres'
|
||||
static_configs:
|
||||
- targets: ['postgres-exporter:9187']
|
||||
scrape_interval: 30s
|
||||
@@ -1,243 +0,0 @@
|
||||
# infrastructure/monitoring/prometheus/rules/alert-system-rules.yml
|
||||
# Prometheus alerting rules for the Bakery Alert and Recommendation System
|
||||
|
||||
groups:
|
||||
- name: alert_system_health
|
||||
rules:
|
||||
# System component health alerts
|
||||
- alert: AlertSystemComponentDown
|
||||
expr: alert_system_component_health == 0
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
service: "{{ $labels.service }}"
|
||||
component: "{{ $labels.component }}"
|
||||
annotations:
|
||||
summary: "Alert system component {{ $labels.component }} is unhealthy"
|
||||
description: "Component {{ $labels.component }} in service {{ $labels.service }} has been unhealthy for more than 2 minutes."
|
||||
runbook_url: "https://docs.bakery.local/runbooks/alert-system#component-health"
|
||||
|
||||
# Connection health alerts
|
||||
- alert: RabbitMQConnectionDown
|
||||
expr: alert_rabbitmq_connection_status == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
service: "{{ $labels.service }}"
|
||||
annotations:
|
||||
summary: "RabbitMQ connection down for {{ $labels.service }}"
|
||||
description: "Service {{ $labels.service }} has lost connection to RabbitMQ for more than 1 minute."
|
||||
runbook_url: "https://docs.bakery.local/runbooks/alert-system#rabbitmq-connection"
|
||||
|
||||
- alert: RedisConnectionDown
|
||||
expr: alert_redis_connection_status == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
service: "{{ $labels.service }}"
|
||||
annotations:
|
||||
summary: "Redis connection down for {{ $labels.service }}"
|
||||
description: "Service {{ $labels.service }} has lost connection to Redis for more than 1 minute."
|
||||
runbook_url: "https://docs.bakery.local/runbooks/alert-system#redis-connection"
|
||||
|
||||
# Leader election issues
|
||||
- alert: NoSchedulerLeader
|
||||
expr: sum(alert_scheduler_leader_status) == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "No scheduler leader elected"
|
||||
description: "No service has been elected as scheduler leader for more than 5 minutes. Scheduled checks may not be running."
|
||||
runbook_url: "https://docs.bakery.local/runbooks/alert-system#leader-election"
|
||||
|
||||
- name: alert_system_performance
|
||||
rules:
|
||||
# High error rates
|
||||
- alert: HighAlertProcessingErrorRate
  # Fires when any series reports more than 0.1 processing errors per second
  # (5-minute rate) for 2 minutes.
  expr: rate(alert_processing_errors_total[5m]) > 0.1
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: "High alert processing error rate"
    # $value is an absolute rate in errors/second, not a ratio, so it must not
    # be rendered with humanizePercentage (0.1 errors/s would read as "10%").
    description: "Alert processing error rate is {{ $value | humanize }} errors/second over the last 5 minutes."
    runbook_url: "https://docs.bakery.local/runbooks/alert-system#processing-errors"
|
||||
|
||||
- alert: HighNotificationDeliveryFailureRate
  # Failure ratio per notification channel over the last 5 minutes.
  # Both sides are aggregated to the `channel` label before dividing: binary
  # operators only pair series whose label sets are identical, so dividing the
  # raw rates of two different metrics silently yields nothing whenever either
  # metric carries an extra label. `channel` is kept because the labels and
  # annotations below template on it.
  expr: >-
    sum by (channel) (rate(alert_delivery_failures_total[5m]))
    /
    sum by (channel) (rate(alert_notifications_sent_total[5m]))
    > 0.05
  for: 3m
  labels:
    severity: warning
    channel: "{{ $labels.channel }}"
  annotations:
    summary: "High notification delivery failure rate for {{ $labels.channel }}"
    description: "Notification delivery failure rate for {{ $labels.channel }} is {{ $value | humanizePercentage }} over the last 5 minutes."
    runbook_url: "https://docs.bakery.local/runbooks/alert-system#delivery-failures"
|
||||
|
||||
# Processing latency
|
||||
- alert: HighAlertProcessingLatency
|
||||
expr: histogram_quantile(0.95, rate(alert_processing_duration_seconds_bucket[5m])) > 5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High alert processing latency"
|
||||
description: "95th percentile alert processing latency is {{ $value }}s, exceeding 5s threshold."
|
||||
runbook_url: "https://docs.bakery.local/runbooks/alert-system#processing-latency"
|
||||
|
||||
# SSE connection issues
|
||||
- alert: TooManySSEConnections
|
||||
expr: sum(alert_sse_active_connections) > 1000
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Too many active SSE connections"
|
||||
description: "Number of active SSE connections ({{ $value }}) exceeds 1000. This may impact performance."
|
||||
runbook_url: "https://docs.bakery.local/runbooks/alert-system#sse-connections"
|
||||
|
||||
- alert: SSEConnectionErrors
|
||||
expr: rate(alert_sse_connection_errors_total[5m]) > 0.5
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High SSE connection error rate"
|
||||
description: "SSE connection error rate is {{ $value }} errors/second over the last 5 minutes."
|
||||
runbook_url: "https://docs.bakery.local/runbooks/alert-system#sse-errors"
|
||||
|
||||
- name: alert_system_business
|
||||
rules:
|
||||
# Alert volume anomalies
|
||||
- alert: UnusuallyHighAlertVolume
|
||||
expr: rate(alert_items_published_total{item_type="alert"}[10m]) > 2
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
service: "{{ $labels.service }}"
|
||||
annotations:
|
||||
summary: "Unusually high alert volume from {{ $labels.service }}"
|
||||
description: "Service {{ $labels.service }} is generating alerts at {{ $value }} alerts/second, which is above normal levels."
|
||||
runbook_url: "https://docs.bakery.local/runbooks/alert-system#high-volume"
|
||||
|
||||
- alert: NoAlertsGenerated
  # Fires when nothing at all has been published for 30 minutes.
  # The rate is summed across all series first: evaluated per series, the rule
  # would fire for every individual label combination that happens to be idle
  # (e.g. a rarely used item_type/severity pair) even while other items are
  # being published normally.
  # NOTE(review): this counts every item_type, not only "alert", although the
  # description speaks of alerts — confirm which is intended.
  expr: sum(rate(alert_items_published_total[30m])) == 0
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: "No alerts generated recently"
    description: "No alerts have been generated in the last 30 minutes. This may indicate a problem with detection systems."
    runbook_url: "https://docs.bakery.local/runbooks/alert-system#no-alerts"
|
||||
|
||||
# Response time issues
|
||||
- alert: SlowAlertResponseTime
|
||||
expr: histogram_quantile(0.95, rate(alert_item_response_time_seconds_bucket[1h])) > 3600
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Slow alert response times"
|
||||
description: "95th percentile alert response time is {{ $value | humanizeDuration }}, exceeding 1 hour."
|
||||
runbook_url: "https://docs.bakery.local/runbooks/alert-system#response-times"
|
||||
|
||||
# Critical alerts not acknowledged
|
||||
- alert: CriticalAlertsUnacknowledged
|
||||
expr: sum(alert_active_items_current{item_type="alert",severity="urgent"}) > 5
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Multiple critical alerts unacknowledged"
|
||||
description: "{{ $value }} critical alerts remain unacknowledged for more than 10 minutes."
|
||||
runbook_url: "https://docs.bakery.local/runbooks/alert-system#critical-unacked"
|
||||
|
||||
- name: alert_system_capacity
|
||||
rules:
|
||||
# Queue size monitoring
|
||||
- alert: LargeSSEMessageQueues
|
||||
expr: alert_sse_message_queue_size > 100
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
tenant_id: "{{ $labels.tenant_id }}"
|
||||
annotations:
|
||||
summary: "Large SSE message queue for tenant {{ $labels.tenant_id }}"
|
||||
description: "SSE message queue for tenant {{ $labels.tenant_id }} has {{ $value }} messages, indicating potential client issues."
|
||||
runbook_url: "https://docs.bakery.local/runbooks/alert-system#sse-queues"
|
||||
|
||||
# Database storage issues
|
||||
- alert: SlowDatabaseStorage
|
||||
expr: histogram_quantile(0.95, rate(alert_database_storage_duration_seconds_bucket[5m])) > 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Slow database storage for alerts"
|
||||
description: "95th percentile database storage time is {{ $value }}s, exceeding 1s threshold."
|
||||
runbook_url: "https://docs.bakery.local/runbooks/alert-system#database-storage"
|
||||
|
||||
- name: alert_system_effectiveness
|
||||
rules:
|
||||
# False positive rate monitoring
|
||||
- alert: HighFalsePositiveRate
|
||||
expr: alert_false_positive_rate > 0.2
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
service: "{{ $labels.service }}"
|
||||
alert_type: "{{ $labels.alert_type }}"
|
||||
annotations:
|
||||
summary: "High false positive rate for {{ $labels.alert_type }}"
|
||||
description: "False positive rate for {{ $labels.alert_type }} in {{ $labels.service }} is {{ $value | humanizePercentage }}."
|
||||
runbook_url: "https://docs.bakery.local/runbooks/alert-system#false-positives"
|
||||
|
||||
# Low recommendation adoption
|
||||
- alert: LowRecommendationAdoption
  # Share of published recommendations that were implemented, per service,
  # over the last 24 hours.
  # Both sides are aggregated to `service` before dividing: the denominator
  # carries an item_type label that the numerator metric is unlikely to have,
  # and a division only pairs series with identical label sets, so the raw
  # ratio would produce no result. `service` is kept because the label and
  # description below template on it.
  # NOTE(review): assumes both metrics expose a `service` label — confirm.
  expr: >-
    sum by (service) (rate(alert_recommendations_implemented_total[24h]))
    /
    sum by (service) (rate(alert_items_published_total{item_type="recommendation"}[24h]))
    < 0.1
  for: 1h
  labels:
    severity: info
    service: "{{ $labels.service }}"
  annotations:
    summary: "Low recommendation adoption rate"
    description: "Recommendation adoption rate for {{ $labels.service }} is {{ $value | humanizePercentage }} over the last 24 hours."
    runbook_url: "https://docs.bakery.local/runbooks/alert-system#recommendation-adoption"
|
||||
|
||||
# Additional alerting rules for specific scenarios
|
||||
- name: alert_system_critical_scenarios
|
||||
rules:
|
||||
# Complete system failure
|
||||
- alert: AlertSystemDown
|
||||
# NOTE(review): the prometheus.yml scrape_configs define a
# 'notification-service' job but no 'alert-processor' job; unless that job is
# scraped elsewhere, this selector can only ever match notification-service.
expr: up{job=~"alert-processor|notification-service"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
service: "{{ $labels.job }}"
|
||||
annotations:
|
||||
summary: "Alert system service {{ $labels.job }} is down"
|
||||
description: "Critical alert system service {{ $labels.job }} has been down for more than 1 minute."
|
||||
runbook_url: "https://docs.bakery.local/runbooks/alert-system#service-down"
|
||||
|
||||
# Data loss prevention
|
||||
- alert: AlertDataNotPersisted
  # Fires when items are being processed but no database storage operation has
  # been recorded in the same 5-minute window.
  # Each side is summed to a single label-less series: `and` only keeps
  # left-hand series that have a right-hand series with exactly the same
  # labels, so combining the raw rates of two different metrics never matches
  # unless their label sets happen to be identical.
  expr: >-
    sum(rate(alert_items_processed_total[5m])) > 0
    and
    sum(rate(alert_database_storage_duration_seconds_count[5m])) == 0
  for: 2m
  labels:
    severity: critical
  annotations:
    summary: "Alert data not being persisted to database"
    description: "Alerts are being processed but not stored in database, potential data loss."
    runbook_url: "https://docs.bakery.local/runbooks/alert-system#data-persistence"
|
||||
|
||||
# Notification blackhole
|
||||
- alert: NotificationsNotDelivered
  # Fires when items are being processed but no notification has been sent on
  # any channel in the same 5-minute window.
  # Each side is summed to a single label-less series: `and` requires
  # identical label sets on both sides, and the sent-notifications counter is
  # split by `channel`, so the raw per-series comparison would never match the
  # processed-items series.
  expr: >-
    sum(rate(alert_items_processed_total[5m])) > 0
    and
    sum(rate(alert_notifications_sent_total[5m])) == 0
  for: 3m
  labels:
    severity: critical
  annotations:
    summary: "Notifications not being delivered"
    description: "Alerts are being processed but no notifications are being sent."
    runbook_url: "https://docs.bakery.local/runbooks/alert-system#notification-delivery"
|
||||
@@ -1,86 +0,0 @@
|
||||
# infrastructure/monitoring/prometheus/rules/alerts.yml
|
||||
# Prometheus alerting rules
|
||||
|
||||
groups:
|
||||
- name: bakery_services
|
||||
rules:
|
||||
# Service availability alerts
|
||||
- alert: ServiceDown
|
||||
expr: up == 0
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Service {{ $labels.job }} is down"
|
||||
description: "Service {{ $labels.job }} has been down for more than 2 minutes."
|
||||
|
||||
# High error rate alerts
|
||||
- alert: HighErrorRate
|
||||
expr: rate(http_requests_total{status=~"5.."}[5m]) > 0.1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High error rate on {{ $labels.job }}"
|
||||
description: "Error rate is {{ $value }} errors per second on {{ $labels.job }}."
|
||||
|
||||
# High response time alerts
|
||||
- alert: HighResponseTime
|
||||
expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High response time on {{ $labels.job }}"
|
||||
description: "95th percentile response time is {{ $value }}s on {{ $labels.job }}."
|
||||
|
||||
# Memory usage alerts
|
||||
- alert: HighMemoryUsage
|
||||
expr: process_resident_memory_bytes / 1024 / 1024 > 500
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High memory usage on {{ $labels.job }}"
|
||||
description: "Memory usage is {{ $value }}MB on {{ $labels.job }}."
|
||||
|
||||
# Database connection alerts
|
||||
- alert: DatabaseConnectionHigh
|
||||
expr: pg_stat_activity_count > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High database connections"
|
||||
description: "Database has {{ $value }} active connections."
|
||||
|
||||
- name: bakery_business
|
||||
rules:
|
||||
# Training job alerts
|
||||
- alert: TrainingJobFailed
|
||||
expr: increase(training_jobs_failed_total[1h]) > 0
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Training job failed"
|
||||
description: "{{ $value }} training jobs have failed in the last hour."
|
||||
|
||||
# Prediction accuracy alerts
|
||||
- alert: LowPredictionAccuracy
|
||||
expr: prediction_accuracy < 0.7
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Low prediction accuracy"
|
||||
description: "Prediction accuracy is {{ $value }} for tenant {{ $labels.tenant_id }}."
|
||||
|
||||
# API rate limit alerts
|
||||
- alert: APIRateLimitHit
|
||||
expr: increase(rate_limit_hits_total[5m]) > 10
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "API rate limit hit frequently"
|
||||
description: "Rate limit has been hit {{ $value }} times in 5 minutes."
|
||||
Reference in New Issue
Block a user