# File: bakery-ia/infrastructure/kubernetes/base/components/monitoring/grafana-dashboards-extended.yaml
# Timestamp: 2026-01-07 19:12:35 +01:00
# Size: 950 lines, 29 KiB (YAML)

---
# ConfigMap carrying extra Grafana dashboards in the monitoring namespace.
# Each key under `data` is one dashboard, stored as a JSON document in a
# literal block scalar.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboards-extended
  namespace: monitoring
data:
postgresql-dashboard.json: |
{
"dashboard": {
"title": "Bakery IA - PostgreSQL Database",
"tags": ["bakery-ia", "postgresql", "database"],
"timezone": "browser",
"refresh": "30s",
"schemaVersion": 16,
"version": 1,
"panels": [
{
"id": 1,
"title": "Active Connections by Database",
"type": "graph",
"gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
"targets": [
{
"expr": "pg_stat_activity_count{state=\"active\"}",
"legendFormat": "{{datname}} - active"
},
{
"expr": "pg_stat_activity_count{state=\"idle\"}",
"legendFormat": "{{datname}} - idle"
},
{
"expr": "pg_stat_activity_count{state=\"idle in transaction\"}",
"legendFormat": "{{datname}} - idle tx"
}
]
},
{
"id": 2,
"title": "Total Connections",
"type": "stat",
"gridPos": {"x": 12, "y": 0, "w": 6, "h": 4},
"targets": [
{
"expr": "sum(pg_stat_activity_count)",
"legendFormat": "Total connections"
}
]
},
{
"id": 3,
"title": "Max Connections",
"type": "stat",
"gridPos": {"x": 18, "y": 0, "w": 6, "h": 4},
"targets": [
{
"expr": "pg_settings_max_connections",
"legendFormat": "Max connections"
}
]
},
{
"id": 4,
"title": "Transaction Rate (Commits vs Rollbacks)",
"type": "graph",
"gridPos": {"x": 0, "y": 8, "w": 12, "h": 8},
"targets": [
{
"expr": "rate(pg_stat_database_xact_commit[5m])",
"legendFormat": "{{datname}} - commits"
},
{
"expr": "rate(pg_stat_database_xact_rollback[5m])",
"legendFormat": "{{datname}} - rollbacks"
}
]
},
{
"id": 5,
"title": "Cache Hit Ratio",
"type": "graph",
"gridPos": {"x": 12, "y": 8, "w": 12, "h": 8},
"targets": [
{
"expr": "100 * sum(rate(pg_stat_database_blks_hit[5m])) / (sum(rate(pg_stat_database_blks_hit[5m])) + sum(rate(pg_stat_database_blks_read[5m])))",
"legendFormat": "Cache hit ratio %"
}
]
},
{
"id": 6,
"title": "Slow Queries (> 30s)",
"type": "table",
"gridPos": {"x": 0, "y": 16, "w": 12, "h": 8},
"targets": [
{
"expr": "pg_stat_activity_max_tx_duration{state=\"active\"} > 30",
"format": "table",
"instant": true
}
],
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": {},
"indexByName": {},
"renameByName": {
"datname": "Database",
"state": "State",
"Value": "Duration (s)"
}
}
}
]
},
{
"id": 7,
"title": "Dead Tuples by Table",
"type": "graph",
"gridPos": {"x": 12, "y": 16, "w": 12, "h": 8},
"targets": [
{
"expr": "pg_stat_user_tables_n_dead_tup",
"legendFormat": "{{schemaname}}.{{relname}}"
}
]
},
{
"id": 8,
"title": "Table Bloat Estimate",
"type": "graph",
"gridPos": {"x": 0, "y": 24, "w": 12, "h": 8},
"targets": [
{
"expr": "100 * pg_stat_user_tables_n_dead_tup / ((pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup) > 0)",
"legendFormat": "{{schemaname}}.{{relname}} bloat %"
}
]
},
{
"id": 9,
"title": "Replication Lag (bytes)",
"type": "graph",
"gridPos": {"x": 12, "y": 24, "w": 12, "h": 8},
"targets": [
{
"expr": "pg_replication_lag_bytes",
"legendFormat": "{{slot_name}} - {{application_name}}"
}
]
},
{
"id": 10,
"title": "Database Size (GB)",
"type": "graph",
"gridPos": {"x": 0, "y": 32, "w": 12, "h": 8},
"targets": [
{
"expr": "pg_database_size_bytes / 1024 / 1024 / 1024",
"legendFormat": "{{datname}}"
}
]
},
{
"id": 11,
"title": "Database Size Growth (per hour)",
"type": "graph",
"gridPos": {"x": 12, "y": 32, "w": 12, "h": 8},
"targets": [
{
"expr": "delta(pg_database_size_bytes[1h])",
"legendFormat": "{{datname}} - bytes/hour"
}
]
},
{
"id": 12,
"title": "Lock Counts by Type",
"type": "graph",
"gridPos": {"x": 0, "y": 40, "w": 12, "h": 8},
"targets": [
{
"expr": "pg_locks_count",
"legendFormat": "{{datname}} - {{locktype}} - {{mode}}"
}
]
},
{
"id": 13,
"title": "Query Duration (p95)",
"type": "graph",
"gridPos": {"x": 12, "y": 40, "w": 12, "h": 8},
"targets": [
{
"expr": "histogram_quantile(0.95, sum by (le) (rate(pg_query_duration_seconds_bucket[5m])))",
"legendFormat": "p95"
}
]
}
]
}
}
node-exporter-dashboard.json: |
{
"dashboard": {
"title": "Bakery IA - Node Exporter Infrastructure",
"tags": ["bakery-ia", "node-exporter", "infrastructure"],
"timezone": "browser",
"refresh": "15s",
"schemaVersion": 16,
"version": 1,
"panels": [
{
"id": 1,
"title": "CPU Usage by Node",
"type": "graph",
"gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
"targets": [
{
"expr": "100 - (avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
"legendFormat": "{{instance}}"
}
]
},
{
"id": 2,
"title": "Average CPU Usage",
"type": "stat",
"gridPos": {"x": 12, "y": 0, "w": 6, "h": 4},
"targets": [
{
"expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
"legendFormat": "Average CPU %"
}
]
},
{
"id": 3,
"title": "CPU Load (1m, 5m, 15m)",
"type": "stat",
"gridPos": {"x": 18, "y": 0, "w": 6, "h": 4},
"targets": [
{
"expr": "avg(node_load1)",
"legendFormat": "1m"
},
{
"expr": "avg(node_load5)",
"legendFormat": "5m"
},
{
"expr": "avg(node_load15)",
"legendFormat": "15m"
}
]
},
{
"id": 4,
"title": "Memory Usage by Node",
"type": "graph",
"gridPos": {"x": 0, "y": 8, "w": 12, "h": 8},
"targets": [
{
"expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
"legendFormat": "{{instance}}"
}
]
},
{
"id": 5,
"title": "Memory Used (GB)",
"type": "stat",
"gridPos": {"x": 12, "y": 8, "w": 6, "h": 4},
"targets": [
{
"expr": "(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / 1024 / 1024 / 1024",
"legendFormat": "{{instance}}"
}
]
},
{
"id": 6,
"title": "Memory Available (GB)",
"type": "stat",
"gridPos": {"x": 18, "y": 8, "w": 6, "h": 4},
"targets": [
{
"expr": "node_memory_MemAvailable_bytes / 1024 / 1024 / 1024",
"legendFormat": "{{instance}}"
}
]
},
{
"id": 7,
"title": "Disk I/O Read Rate (MB/s)",
"type": "graph",
"gridPos": {"x": 0, "y": 16, "w": 12, "h": 8},
"targets": [
{
"expr": "rate(node_disk_read_bytes_total[5m]) / 1024 / 1024",
"legendFormat": "{{instance}} - {{device}}"
}
]
},
{
"id": 8,
"title": "Disk I/O Write Rate (MB/s)",
"type": "graph",
"gridPos": {"x": 12, "y": 16, "w": 12, "h": 8},
"targets": [
{
"expr": "rate(node_disk_written_bytes_total[5m]) / 1024 / 1024",
"legendFormat": "{{instance}} - {{device}}"
}
]
},
{
"id": 9,
"title": "Disk I/O Operations (IOPS)",
"type": "graph",
"gridPos": {"x": 0, "y": 24, "w": 12, "h": 8},
"targets": [
{
"expr": "rate(node_disk_reads_completed_total[5m]) + rate(node_disk_writes_completed_total[5m])",
"legendFormat": "{{instance}} - {{device}}"
}
]
},
{
"id": 10,
"title": "Network Receive Rate (Mbps)",
"type": "graph",
"gridPos": {"x": 12, "y": 24, "w": 12, "h": 8},
"targets": [
{
"expr": "rate(node_network_receive_bytes_total{device!=\"lo\"}[5m]) * 8 / 1024 / 1024",
"legendFormat": "{{instance}} - {{device}}"
}
]
},
{
"id": 11,
"title": "Network Transmit Rate (Mbps)",
"type": "graph",
"gridPos": {"x": 0, "y": 32, "w": 12, "h": 8},
"targets": [
{
"expr": "rate(node_network_transmit_bytes_total{device!=\"lo\"}[5m]) * 8 / 1024 / 1024",
"legendFormat": "{{instance}} - {{device}}"
}
]
},
{
"id": 12,
"title": "Network Errors",
"type": "graph",
"gridPos": {"x": 12, "y": 32, "w": 12, "h": 8},
"targets": [
{
"expr": "rate(node_network_receive_errs_total[5m]) + rate(node_network_transmit_errs_total[5m])",
"legendFormat": "{{instance}} - {{device}}"
}
]
},
{
"id": 13,
"title": "Filesystem Usage by Mount",
"type": "graph",
"gridPos": {"x": 0, "y": 40, "w": 12, "h": 8},
"targets": [
{
"expr": "100 * (1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes))",
"legendFormat": "{{instance}} - {{mountpoint}}"
}
]
},
{
"id": 14,
"title": "Filesystem Available (GB)",
"type": "stat",
"gridPos": {"x": 12, "y": 40, "w": 6, "h": 4},
"targets": [
{
"expr": "node_filesystem_avail_bytes / 1024 / 1024 / 1024",
"legendFormat": "{{instance}} - {{mountpoint}}"
}
]
},
{
"id": 15,
"title": "Filesystem Size (GB)",
"type": "stat",
"gridPos": {"x": 18, "y": 40, "w": 6, "h": 4},
"targets": [
{
"expr": "node_filesystem_size_bytes / 1024 / 1024 / 1024",
"legendFormat": "{{instance}} - {{mountpoint}}"
}
]
},
{
"id": 16,
"title": "Load Average (1m, 5m, 15m)",
"type": "graph",
"gridPos": {"x": 0, "y": 48, "w": 12, "h": 8},
"targets": [
{
"expr": "node_load1",
"legendFormat": "{{instance}} - 1m"
},
{
"expr": "node_load5",
"legendFormat": "{{instance}} - 5m"
},
{
"expr": "node_load15",
"legendFormat": "{{instance}} - 15m"
}
]
},
{
"id": 17,
"title": "System Up Time",
"type": "stat",
"gridPos": {"x": 12, "y": 48, "w": 12, "h": 8},
"fieldConfig": {"defaults": {"unit": "s"}},
"targets": [
{
"expr": "time() - node_boot_time_seconds",
"legendFormat": "{{instance}} - uptime"
}
]
},
{
"id": 18,
"title": "Context Switches",
"type": "graph",
"gridPos": {"x": 0, "y": 56, "w": 12, "h": 8},
"targets": [
{
"expr": "rate(node_context_switches_total[5m])",
"legendFormat": "{{instance}}"
}
]
},
{
"id": 19,
"title": "Interrupts",
"type": "graph",
"gridPos": {"x": 12, "y": 56, "w": 12, "h": 8},
"targets": [
{
"expr": "rate(node_intr_total[5m])",
"legendFormat": "{{instance}}"
}
]
}
]
}
}
alertmanager-dashboard.json: |
{
"dashboard": {
"title": "Bakery IA - AlertManager Monitoring",
"tags": ["bakery-ia", "alertmanager", "alerting"],
"timezone": "browser",
"refresh": "10s",
"schemaVersion": 16,
"version": 1,
"panels": [
{
"id": 1,
"title": "Active Alerts by Severity",
"type": "graph",
"gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
"targets": [
{
"expr": "count by (severity) (ALERTS{alertstate=\"firing\"})",
"legendFormat": "{{severity}}"
}
]
},
{
"id": 2,
"title": "Total Active Alerts",
"type": "stat",
"gridPos": {"x": 12, "y": 0, "w": 6, "h": 4},
"targets": [
{
"expr": "count(ALERTS{alertstate=\"firing\"})",
"legendFormat": "Active alerts"
}
]
},
{
"id": 3,
"title": "Critical Alerts",
"type": "stat",
"gridPos": {"x": 18, "y": 0, "w": 6, "h": 4},
"targets": [
{
"expr": "count(ALERTS{alertstate=\"firing\", severity=\"critical\"})",
"legendFormat": "Critical"
}
]
},
{
"id": 4,
"title": "Alert Firing Rate (per minute)",
"type": "graph",
"gridPos": {"x": 0, "y": 8, "w": 12, "h": 8},
"targets": [
{
"expr": "sum(rate(alertmanager_alerts_received_total{status=\"firing\"}[5m])) * 60",
"legendFormat": "Alerts fired/min"
}
]
},
{
"id": 5,
"title": "Alert Resolution Rate (per minute)",
"type": "graph",
"gridPos": {"x": 12, "y": 8, "w": 12, "h": 8},
"targets": [
{
"expr": "sum(rate(alertmanager_alerts_received_total{status=\"resolved\"}[5m])) * 60",
"legendFormat": "Alerts resolved/min"
}
]
},
{
"id": 6,
"title": "Notification Success Rate",
"type": "graph",
"gridPos": {"x": 0, "y": 16, "w": 12, "h": 8},
"targets": [
{
"expr": "100 * (1 - (sum(rate(alertmanager_notifications_failed_total[5m])) / sum(rate(alertmanager_notifications_total[5m]))))",
"legendFormat": "Success rate %"
}
]
},
{
"id": 7,
"title": "Notification Failures",
"type": "graph",
"gridPos": {"x": 12, "y": 16, "w": 12, "h": 8},
"targets": [
{
"expr": "sum by (integration) (rate(alertmanager_notifications_failed_total[5m]))",
"legendFormat": "{{integration}}"
}
]
},
{
"id": 8,
"title": "Silenced Alerts",
"type": "stat",
"gridPos": {"x": 0, "y": 24, "w": 6, "h": 4},
"targets": [
{
"expr": "sum(alertmanager_alerts{state=\"suppressed\"})",
"legendFormat": "Silenced"
}
]
},
{
"id": 9,
"title": "AlertManager Cluster Size",
"type": "stat",
"gridPos": {"x": 6, "y": 24, "w": 6, "h": 4},
"targets": [
{
"expr": "max(alertmanager_cluster_members)",
"legendFormat": "Cluster peers"
}
]
},
{
"id": 10,
"title": "AlertManager Peers",
"type": "stat",
"gridPos": {"x": 12, "y": 24, "w": 6, "h": 4},
"targets": [
{
"expr": "alertmanager_cluster_members",
"legendFormat": "{{instance}}"
}
]
},
{
"id": 11,
"title": "Cluster Status",
"type": "stat",
"gridPos": {"x": 18, "y": 24, "w": 6, "h": 4},
"targets": [
{
"expr": "up{job=\"alertmanager\"}",
"legendFormat": "{{instance}}"
}
]
},
{
"id": 12,
"title": "Alerts by Group",
"type": "table",
"gridPos": {"x": 0, "y": 28, "w": 12, "h": 8},
"targets": [
{
"expr": "count by (alertname) (ALERTS{alertstate=\"firing\"})",
"format": "table",
"instant": true
}
],
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": {},
"indexByName": {},
"renameByName": {
"alertname": "Alert Name",
"Value": "Count"
}
}
}
]
},
{
"id": 13,
"title": "Alert Duration (p99)",
"type": "graph",
"gridPos": {"x": 12, "y": 28, "w": 12, "h": 8},
"targets": [
{
"expr": "histogram_quantile(0.99, rate(alertmanager_alert_duration_seconds_bucket[5m]))",
"legendFormat": "p99 duration"
}
]
},
{
"id": 14,
"title": "Processing Time",
"type": "graph",
"gridPos": {"x": 0, "y": 36, "w": 12, "h": 8},
"targets": [
{
"expr": "sum by (integration) (rate(alertmanager_notification_latency_seconds_sum[5m])) / sum by (integration) (rate(alertmanager_notification_latency_seconds_count[5m]))",
"legendFormat": "{{integration}}"
}
]
},
{
"id": 15,
"title": "Memory Usage",
"type": "stat",
"gridPos": {"x": 12, "y": 36, "w": 12, "h": 8},
"targets": [
{
"expr": "process_resident_memory_bytes{job=\"alertmanager\"} / 1024 / 1024",
"legendFormat": "{{instance}} - MB"
}
]
}
]
}
}
business-metrics-dashboard.json: |
{
"dashboard": {
"title": "Bakery IA - Business Metrics & KPIs",
"tags": ["bakery-ia", "business-metrics", "kpis"],
"timezone": "browser",
"refresh": "30s",
"schemaVersion": 16,
"version": 1,
"panels": [
{
"id": 1,
"title": "Requests per Service (Rate)",
"type": "graph",
"gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
"targets": [
{
"expr": "sum by (service) (rate(http_requests_total[5m]))",
"legendFormat": "{{service}}"
}
]
},
{
"id": 2,
"title": "Total Request Rate",
"type": "stat",
"gridPos": {"x": 12, "y": 0, "w": 6, "h": 4},
"targets": [
{
"expr": "sum(rate(http_requests_total[5m]))",
"legendFormat": "requests/sec"
}
]
},
{
"id": 3,
"title": "Peak Request Rate (5m)",
"type": "stat",
"gridPos": {"x": 18, "y": 0, "w": 6, "h": 4},
"targets": [
{
"expr": "max_over_time(sum(rate(http_requests_total[5m]))[$__range:])",
"legendFormat": "Peak requests/sec",
"instant": true
}
]
},
{
"id": 4,
"title": "Error Rates by Service",
"type": "graph",
"gridPos": {"x": 0, "y": 8, "w": 12, "h": 8},
"targets": [
{
"expr": "sum by (service) (rate(http_requests_total{status_code=~\"5..\"}[5m]))",
"legendFormat": "{{service}}"
}
]
},
{
"id": 5,
"title": "Overall Error Rate",
"type": "stat",
"gridPos": {"x": 12, "y": 8, "w": 6, "h": 4},
"targets": [
{
"expr": "100 * (sum(rate(http_requests_total{status_code=~\"5..\"}[5m])) / sum(rate(http_requests_total[5m])))",
"legendFormat": "Error %"
}
]
},
{
"id": 6,
"title": "4xx Error Rate",
"type": "stat",
"gridPos": {"x": 18, "y": 8, "w": 6, "h": 4},
"targets": [
{
"expr": "100 * (sum(rate(http_requests_total{status_code=~\"4..\"}[5m])) / sum(rate(http_requests_total[5m])))",
"legendFormat": "4xx %"
}
]
},
{
"id": 7,
"title": "P95 Latency by Service (ms)",
"type": "graph",
"gridPos": {"x": 0, "y": 16, "w": 12, "h": 8},
"targets": [
{
"expr": "histogram_quantile(0.95, sum by (service, le) (rate(http_request_duration_seconds_bucket[5m]))) * 1000",
"legendFormat": "{{service}} p95"
}
]
},
{
"id": 8,
"title": "P99 Latency by Service (ms)",
"type": "graph",
"gridPos": {"x": 12, "y": 16, "w": 12, "h": 8},
"targets": [
{
"expr": "histogram_quantile(0.99, sum by (service, le) (rate(http_request_duration_seconds_bucket[5m]))) * 1000",
"legendFormat": "{{service}} p99"
}
]
},
{
"id": 9,
"title": "Average Latency (ms)",
"type": "stat",
"gridPos": {"x": 0, "y": 24, "w": 6, "h": 4},
"targets": [
{
"expr": "(sum(rate(http_request_duration_seconds_sum[5m])) / sum(rate(http_request_duration_seconds_count[5m]))) * 1000",
"legendFormat": "Avg latency ms"
}
]
},
{
"id": 10,
"title": "Active Tenants",
"type": "stat",
"gridPos": {"x": 6, "y": 24, "w": 6, "h": 4},
"targets": [
{
"expr": "count(sum by (tenant_id) (rate(http_requests_total{tenant_id!=\"\"}[5m])) > 0) or vector(0)",
"legendFormat": "Active tenants"
}
]
},
{
"id": 11,
"title": "Requests per Tenant",
"type": "stat",
"gridPos": {"x": 12, "y": 24, "w": 12, "h": 4},
"targets": [
{
"expr": "sum by (tenant_id) (rate(http_requests_total[5m]))",
"legendFormat": "Tenant {{tenant_id}}"
}
]
},
{
"id": 12,
"title": "Alert Generation Rate (per minute)",
"type": "graph",
"gridPos": {"x": 0, "y": 32, "w": 12, "h": 8},
"targets": [
{
"expr": "count by (alertname) (ALERTS_FOR_STATE > (time() - 60))",
"legendFormat": "{{alertname}}"
}
]
},
{
"id": 13,
"title": "Training Job Success Rate",
"type": "stat",
"gridPos": {"x": 12, "y": 32, "w": 12, "h": 8},
"targets": [
{
"expr": "100 * (sum(training_job_completed_total{status=\"success\"}) / sum(training_job_completed_total))",
"legendFormat": "Success rate %"
}
]
},
{
"id": 14,
"title": "Training Jobs in Progress",
"type": "stat",
"gridPos": {"x": 0, "y": 40, "w": 6, "h": 4},
"targets": [
{
"expr": "sum(training_job_in_progress)",
"legendFormat": "Jobs running"
}
]
},
{
"id": 15,
"title": "Training Job Completion Time (p95, minutes)",
"type": "stat",
"gridPos": {"x": 6, "y": 40, "w": 6, "h": 4},
"targets": [
{
"expr": "histogram_quantile(0.95, sum by (le) (rate(training_job_duration_seconds_bucket[24h]))) / 60",
"legendFormat": "p95 minutes"
}
]
},
{
"id": 16,
"title": "Failed Training Jobs",
"type": "stat",
"gridPos": {"x": 12, "y": 40, "w": 6, "h": 4},
"targets": [
{
"expr": "sum(training_job_completed_total{status=\"failed\"})",
"legendFormat": "Failed jobs"
}
]
},
{
"id": 17,
"title": "Total Training Jobs Completed",
"type": "stat",
"gridPos": {"x": 18, "y": 40, "w": 6, "h": 4},
"targets": [
{
"expr": "sum(training_job_completed_total)",
"legendFormat": "Total completed"
}
]
},
{
"id": 18,
"title": "API Health Status",
"type": "table",
"gridPos": {"x": 0, "y": 48, "w": 12, "h": 8},
"targets": [
{
"expr": "up{job=\"bakery-services\"}",
"format": "table",
"instant": true
}
],
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": {},
"indexByName": {},
"renameByName": {
"service": "Service",
"Value": "Status",
"instance": "Instance"
}
}
}
]
},
{
"id": 19,
"title": "Service Success Rate (%)",
"type": "graph",
"gridPos": {"x": 12, "y": 48, "w": 12, "h": 8},
"targets": [
{
"expr": "100 * (1 - (sum by (service) (rate(http_requests_total{status_code=~\"5..\"}[5m])) / sum by (service) (rate(http_requests_total[5m]))))",
"legendFormat": "{{service}}"
}
]
},
{
"id": 20,
"title": "Requests Processed Today",
"type": "stat",
"gridPos": {"x": 0, "y": 56, "w": 12, "h": 4},
"targets": [
{
"expr": "sum(increase(http_requests_total[24h]))",
"legendFormat": "Requests (24h)"
}
]
},
{
"id": 21,
"title": "Distinct Users Today",
"type": "stat",
"gridPos": {"x": 12, "y": 56, "w": 12, "h": 4},
"targets": [
{
"expr": "count(sum by (user_id) (increase(http_requests_total{user_id!=\"\"}[24h])) > 0) or vector(0)",
"legendFormat": "Users (24h)"
}
]
}
]
}
}