# Kubernetes ConfigMap bundling extended Grafana dashboards; each data key holds one dashboard as embedded JSON.
apiVersion: v1
|
|
kind: ConfigMap
|
|
metadata:
|
|
name: grafana-dashboards-extended
|
|
namespace: monitoring
|
|
data:
|
|
postgresql-dashboard.json: |
|
|
{
|
|
"dashboard": {
|
|
"title": "Bakery IA - PostgreSQL Database",
|
|
"tags": ["bakery-ia", "postgresql", "database"],
|
|
"timezone": "browser",
|
|
"refresh": "30s",
|
|
"schemaVersion": 16,
|
|
"version": 1,
|
|
"panels": [
|
|
{
|
|
"id": 1,
|
|
"title": "Active Connections by Database",
|
|
"type": "graph",
|
|
"gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
|
|
"targets": [
|
|
{
|
|
"expr": "pg_stat_activity_count{state=\"active\"}",
|
|
"legendFormat": "{{datname}} - active"
|
|
},
|
|
{
|
|
"expr": "pg_stat_activity_count{state=\"idle\"}",
|
|
"legendFormat": "{{datname}} - idle"
|
|
},
|
|
{
|
|
"expr": "pg_stat_activity_count{state=\"idle in transaction\"}",
|
|
"legendFormat": "{{datname}} - idle tx"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 2,
|
|
"title": "Total Connections",
|
|
"type": "stat",
|
|
"gridPos": {"x": 12, "y": 0, "w": 6, "h": 4},
|
|
"targets": [
|
|
{
|
|
"expr": "sum(pg_stat_activity_count)",
|
|
"legendFormat": "Total connections"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 3,
|
|
"title": "Max Connections",
|
|
"type": "stat",
|
|
"gridPos": {"x": 18, "y": 0, "w": 6, "h": 4},
|
|
"targets": [
|
|
{
|
|
"expr": "pg_settings_max_connections",
|
|
"legendFormat": "Max connections"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 4,
|
|
"title": "Transaction Rate (Commits vs Rollbacks)",
|
|
"type": "graph",
|
|
"gridPos": {"x": 0, "y": 8, "w": 12, "h": 8},
|
|
"targets": [
|
|
{
|
|
"expr": "rate(pg_stat_database_xact_commit[5m])",
|
|
"legendFormat": "{{datname}} - commits"
|
|
},
|
|
{
|
|
"expr": "rate(pg_stat_database_xact_rollback[5m])",
|
|
"legendFormat": "{{datname}} - rollbacks"
|
|
}
|
|
]
|
|
},
|
|
{
  "id": 5,
  "title": "Cache Hit Ratio",
  "type": "graph",
  "gridPos": {"x": 12, "y": 8, "w": 12, "h": 8},
  "targets": [
    {
      "expr": "100 * sum(rate(pg_stat_database_blks_hit[5m])) / (sum(rate(pg_stat_database_blks_hit[5m])) + sum(rate(pg_stat_database_blks_read[5m])))",
      "legendFormat": "Cache hit ratio %"
    }
  ]
},
|
|
{
  "id": 6,
  "title": "Slow Queries (> 30s)",
  "type": "table",
  "gridPos": {"x": 0, "y": 16, "w": 12, "h": 8},
  "targets": [
    {
      "expr": "pg_slow_queries > 30000",
      "format": "table",
      "instant": true
    }
  ],
  "transformations": [
    {
      "id": "organize",
      "options": {
        "excludeByName": {},
        "indexByName": {},
        "renameByName": {
          "query": "Query",
          "duration_ms": "Duration (ms)",
          "datname": "Database"
        }
      }
    }
  ]
},
|
|
{
|
|
"id": 7,
|
|
"title": "Dead Tuples by Table",
|
|
"type": "graph",
|
|
"gridPos": {"x": 12, "y": 16, "w": 12, "h": 8},
|
|
"targets": [
|
|
{
|
|
"expr": "pg_stat_user_tables_n_dead_tup",
|
|
"legendFormat": "{{schemaname}}.{{relname}}"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 8,
|
|
"title": "Table Bloat Estimate",
|
|
"type": "graph",
|
|
"gridPos": {"x": 0, "y": 24, "w": 12, "h": 8},
|
|
"targets": [
|
|
{
|
|
"expr": "100 * (pg_stat_user_tables_n_dead_tup * avg_tuple_size) / (pg_total_relation_size * 8192)",
|
|
"legendFormat": "{{schemaname}}.{{relname}} bloat %"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 9,
|
|
"title": "Replication Lag (bytes)",
|
|
"type": "graph",
|
|
"gridPos": {"x": 12, "y": 24, "w": 12, "h": 8},
|
|
"targets": [
|
|
{
|
|
"expr": "pg_replication_lag_bytes",
|
|
"legendFormat": "{{slot_name}} - {{application_name}}"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 10,
|
|
"title": "Database Size (GB)",
|
|
"type": "graph",
|
|
"gridPos": {"x": 0, "y": 32, "w": 12, "h": 8},
|
|
"targets": [
|
|
{
|
|
"expr": "pg_database_size_bytes / 1024 / 1024 / 1024",
|
|
"legendFormat": "{{datname}}"
|
|
}
|
|
]
|
|
},
|
|
{
  "id": 11,
  "title": "Database Size Growth (per hour)",
  "type": "graph",
  "gridPos": {"x": 12, "y": 32, "w": 12, "h": 8},
  "targets": [
    {
      "expr": "delta(pg_database_size_bytes[1h])",
      "legendFormat": "{{datname}} - bytes/hour"
    }
  ]
},
|
|
{
|
|
"id": 12,
|
|
"title": "Lock Counts by Type",
|
|
"type": "graph",
|
|
"gridPos": {"x": 0, "y": 40, "w": 12, "h": 8},
|
|
"targets": [
|
|
{
|
|
"expr": "pg_locks_count",
|
|
"legendFormat": "{{datname}} - {{locktype}} - {{mode}}"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 13,
|
|
"title": "Query Duration (p95)",
|
|
"type": "graph",
|
|
"gridPos": {"x": 12, "y": 40, "w": 12, "h": 8},
|
|
"targets": [
|
|
{
|
|
"expr": "histogram_quantile(0.95, rate(pg_query_duration_seconds_bucket[5m]))",
|
|
"legendFormat": "p95"
|
|
}
|
|
]
|
|
}
|
|
]
|
|
}
|
|
}
|
|
|
|
node-exporter-dashboard.json: |
|
|
{
|
|
"dashboard": {
|
|
"title": "Bakery IA - Node Exporter Infrastructure",
|
|
"tags": ["bakery-ia", "node-exporter", "infrastructure"],
|
|
"timezone": "browser",
|
|
"refresh": "15s",
|
|
"schemaVersion": 16,
|
|
"version": 1,
|
|
"panels": [
|
|
{
  "id": 1,
  "title": "CPU Usage by Node",
  "type": "graph",
  "gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
  "targets": [
    {
      "expr": "100 - (avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
      "legendFormat": "{{instance}}"
    }
  ]
},
|
|
{
|
|
"id": 2,
|
|
"title": "Average CPU Usage",
|
|
"type": "stat",
|
|
"gridPos": {"x": 12, "y": 0, "w": 6, "h": 4},
|
|
"targets": [
|
|
{
|
|
"expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
|
|
"legendFormat": "Average CPU %"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 3,
|
|
"title": "CPU Load (1m, 5m, 15m)",
|
|
"type": "stat",
|
|
"gridPos": {"x": 18, "y": 0, "w": 6, "h": 4},
|
|
"targets": [
|
|
{
|
|
"expr": "avg(node_load1)",
|
|
"legendFormat": "1m"
|
|
},
|
|
{
|
|
"expr": "avg(node_load5)",
|
|
"legendFormat": "5m"
|
|
},
|
|
{
|
|
"expr": "avg(node_load15)",
|
|
"legendFormat": "15m"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 4,
|
|
"title": "Memory Usage by Node",
|
|
"type": "graph",
|
|
"gridPos": {"x": 0, "y": 8, "w": 12, "h": 8},
|
|
"targets": [
|
|
{
|
|
"expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
|
|
"legendFormat": "{{instance}}"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 5,
|
|
"title": "Memory Used (GB)",
|
|
"type": "stat",
|
|
"gridPos": {"x": 12, "y": 8, "w": 6, "h": 4},
|
|
"targets": [
|
|
{
|
|
"expr": "(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / 1024 / 1024 / 1024",
|
|
"legendFormat": "{{instance}}"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 6,
|
|
"title": "Memory Available (GB)",
|
|
"type": "stat",
|
|
"gridPos": {"x": 18, "y": 8, "w": 6, "h": 4},
|
|
"targets": [
|
|
{
|
|
"expr": "node_memory_MemAvailable_bytes / 1024 / 1024 / 1024",
|
|
"legendFormat": "{{instance}}"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 7,
|
|
"title": "Disk I/O Read Rate (MB/s)",
|
|
"type": "graph",
|
|
"gridPos": {"x": 0, "y": 16, "w": 12, "h": 8},
|
|
"targets": [
|
|
{
|
|
"expr": "rate(node_disk_read_bytes_total[5m]) / 1024 / 1024",
|
|
"legendFormat": "{{instance}} - {{device}}"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 8,
|
|
"title": "Disk I/O Write Rate (MB/s)",
|
|
"type": "graph",
|
|
"gridPos": {"x": 12, "y": 16, "w": 12, "h": 8},
|
|
"targets": [
|
|
{
|
|
"expr": "rate(node_disk_written_bytes_total[5m]) / 1024 / 1024",
|
|
"legendFormat": "{{instance}} - {{device}}"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 9,
|
|
"title": "Disk I/O Operations (IOPS)",
|
|
"type": "graph",
|
|
"gridPos": {"x": 0, "y": 24, "w": 12, "h": 8},
|
|
"targets": [
|
|
{
|
|
"expr": "rate(node_disk_reads_completed_total[5m]) + rate(node_disk_writes_completed_total[5m])",
|
|
"legendFormat": "{{instance}} - {{device}}"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 10,
|
|
"title": "Network Receive Rate (Mbps)",
|
|
"type": "graph",
|
|
"gridPos": {"x": 12, "y": 24, "w": 12, "h": 8},
|
|
"targets": [
|
|
{
|
|
"expr": "rate(node_network_receive_bytes_total{device!=\"lo\"}[5m]) * 8 / 1024 / 1024",
|
|
"legendFormat": "{{instance}} - {{device}}"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 11,
|
|
"title": "Network Transmit Rate (Mbps)",
|
|
"type": "graph",
|
|
"gridPos": {"x": 0, "y": 32, "w": 12, "h": 8},
|
|
"targets": [
|
|
{
|
|
"expr": "rate(node_network_transmit_bytes_total{device!=\"lo\"}[5m]) * 8 / 1024 / 1024",
|
|
"legendFormat": "{{instance}} - {{device}}"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 12,
|
|
"title": "Network Errors",
|
|
"type": "graph",
|
|
"gridPos": {"x": 12, "y": 32, "w": 12, "h": 8},
|
|
"targets": [
|
|
{
|
|
"expr": "rate(node_network_receive_errs_total[5m]) + rate(node_network_transmit_errs_total[5m])",
|
|
"legendFormat": "{{instance}} - {{device}}"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 13,
|
|
"title": "Filesystem Usage by Mount",
|
|
"type": "graph",
|
|
"gridPos": {"x": 0, "y": 40, "w": 12, "h": 8},
|
|
"targets": [
|
|
{
|
|
"expr": "100 * (1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes))",
|
|
"legendFormat": "{{instance}} - {{mountpoint}}"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 14,
|
|
"title": "Filesystem Available (GB)",
|
|
"type": "stat",
|
|
"gridPos": {"x": 12, "y": 40, "w": 6, "h": 4},
|
|
"targets": [
|
|
{
|
|
"expr": "node_filesystem_avail_bytes / 1024 / 1024 / 1024",
|
|
"legendFormat": "{{instance}} - {{mountpoint}}"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 15,
|
|
"title": "Filesystem Size (GB)",
|
|
"type": "stat",
|
|
"gridPos": {"x": 18, "y": 40, "w": 6, "h": 4},
|
|
"targets": [
|
|
{
|
|
"expr": "node_filesystem_size_bytes / 1024 / 1024 / 1024",
|
|
"legendFormat": "{{instance}} - {{mountpoint}}"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 16,
|
|
"title": "Load Average (1m, 5m, 15m)",
|
|
"type": "graph",
|
|
"gridPos": {"x": 0, "y": 48, "w": 12, "h": 8},
|
|
"targets": [
|
|
{
|
|
"expr": "node_load1",
|
|
"legendFormat": "{{instance}} - 1m"
|
|
},
|
|
{
|
|
"expr": "node_load5",
|
|
"legendFormat": "{{instance}} - 5m"
|
|
},
|
|
{
|
|
"expr": "node_load15",
|
|
"legendFormat": "{{instance}} - 15m"
|
|
}
|
|
]
|
|
},
|
|
{
  "id": 17,
  "title": "System Up Time",
  "type": "stat",
  "gridPos": {"x": 12, "y": 48, "w": 12, "h": 8},
  "fieldConfig": {
    "defaults": {
      "unit": "s"
    }
  },
  "targets": [
    {
      "expr": "time() - node_boot_time_seconds",
      "legendFormat": "{{instance}} - uptime"
    }
  ]
},
|
|
{
|
|
"id": 18,
|
|
"title": "Context Switches",
|
|
"type": "graph",
|
|
"gridPos": {"x": 0, "y": 56, "w": 12, "h": 8},
|
|
"targets": [
|
|
{
|
|
"expr": "rate(node_context_switches_total[5m])",
|
|
"legendFormat": "{{instance}}"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 19,
|
|
"title": "Interrupts",
|
|
"type": "graph",
|
|
"gridPos": {"x": 12, "y": 56, "w": 12, "h": 8},
|
|
"targets": [
|
|
{
|
|
"expr": "rate(node_intr_total[5m])",
|
|
"legendFormat": "{{instance}}"
|
|
}
|
|
]
|
|
}
|
|
]
|
|
}
|
|
}
|
|
|
|
alertmanager-dashboard.json: |
|
|
{
|
|
"dashboard": {
|
|
"title": "Bakery IA - AlertManager Monitoring",
|
|
"tags": ["bakery-ia", "alertmanager", "alerting"],
|
|
"timezone": "browser",
|
|
"refresh": "10s",
|
|
"schemaVersion": 16,
|
|
"version": 1,
|
|
"panels": [
|
|
{
|
|
"id": 1,
|
|
"title": "Active Alerts by Severity",
|
|
"type": "graph",
|
|
"gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
|
|
"targets": [
|
|
{
|
|
"expr": "count by (severity) (ALERTS{alertstate=\"firing\"})",
|
|
"legendFormat": "{{severity}}"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 2,
|
|
"title": "Total Active Alerts",
|
|
"type": "stat",
|
|
"gridPos": {"x": 12, "y": 0, "w": 6, "h": 4},
|
|
"targets": [
|
|
{
|
|
"expr": "count(ALERTS{alertstate=\"firing\"})",
|
|
"legendFormat": "Active alerts"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 3,
|
|
"title": "Critical Alerts",
|
|
"type": "stat",
|
|
"gridPos": {"x": 18, "y": 0, "w": 6, "h": 4},
|
|
"targets": [
|
|
{
|
|
"expr": "count(ALERTS{alertstate=\"firing\", severity=\"critical\"})",
|
|
"legendFormat": "Critical"
|
|
}
|
|
]
|
|
},
|
|
{
  "id": 4,
  "title": "Alert Firing Rate (per minute)",
  "type": "graph",
  "gridPos": {"x": 0, "y": 8, "w": 12, "h": 8},
  "targets": [
    {
      "expr": "rate(alertmanager_alerts_fired_total[1m]) * 60",
      "legendFormat": "Alerts fired/min"
    }
  ]
},
|
|
{
  "id": 5,
  "title": "Alert Resolution Rate (per minute)",
  "type": "graph",
  "gridPos": {"x": 12, "y": 8, "w": 12, "h": 8},
  "targets": [
    {
      "expr": "rate(alertmanager_alerts_resolved_total[1m]) * 60",
      "legendFormat": "Alerts resolved/min"
    }
  ]
},
|
|
{
  "id": 6,
  "title": "Notification Success Rate",
  "type": "graph",
  "gridPos": {"x": 0, "y": 16, "w": 12, "h": 8},
  "targets": [
    {
      "expr": "100 * (1 - (sum(rate(alertmanager_notifications_failed_total[5m])) / sum(rate(alertmanager_notifications_total[5m]))))",
      "legendFormat": "Success rate %"
    }
  ]
},
|
|
{
  "id": 7,
  "title": "Notification Failures",
  "type": "graph",
  "gridPos": {"x": 12, "y": 16, "w": 12, "h": 8},
  "targets": [
    {
      "expr": "sum by (integration) (rate(alertmanager_notifications_failed_total[5m]))",
      "legendFormat": "{{integration}}"
    }
  ]
},
|
|
{
  "id": 8,
  "title": "Silenced Alerts",
  "type": "stat",
  "gridPos": {"x": 0, "y": 24, "w": 6, "h": 4},
  "targets": [
    {
      "expr": "max(alertmanager_alerts{state=\"suppressed\"})",
      "legendFormat": "Silenced"
    }
  ]
},
|
|
{
|
|
"id": 9,
|
|
"title": "AlertManager Cluster Size",
|
|
"type": "stat",
|
|
"gridPos": {"x": 6, "y": 24, "w": 6, "h": 4},
|
|
"targets": [
|
|
{
|
|
"expr": "count(alertmanager_cluster_peers)",
|
|
"legendFormat": "Cluster peers"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 10,
|
|
"title": "AlertManager Peers",
|
|
"type": "stat",
|
|
"gridPos": {"x": 12, "y": 24, "w": 6, "h": 4},
|
|
"targets": [
|
|
{
|
|
"expr": "alertmanager_cluster_peers",
|
|
"legendFormat": "{{instance}}"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 11,
|
|
"title": "Cluster Status",
|
|
"type": "stat",
|
|
"gridPos": {"x": 18, "y": 24, "w": 6, "h": 4},
|
|
"targets": [
|
|
{
|
|
"expr": "up{job=\"alertmanager\"}",
|
|
"legendFormat": "{{instance}}"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 12,
|
|
"title": "Alerts by Group",
|
|
"type": "table",
|
|
"gridPos": {"x": 0, "y": 28, "w": 12, "h": 8},
|
|
"targets": [
|
|
{
|
|
"expr": "count by (alertname) (ALERTS{alertstate=\"firing\"})",
|
|
"format": "table",
|
|
"instant": true
|
|
}
|
|
],
|
|
"transformations": [
|
|
{
|
|
"id": "organize",
|
|
"options": {
|
|
"excludeByName": {},
|
|
"indexByName": {},
|
|
"renameByName": {
|
|
"alertname": "Alert Name",
|
|
"Value": "Count"
|
|
}
|
|
}
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 13,
|
|
"title": "Alert Duration (p99)",
|
|
"type": "graph",
|
|
"gridPos": {"x": 12, "y": 28, "w": 12, "h": 8},
|
|
"targets": [
|
|
{
|
|
"expr": "histogram_quantile(0.99, rate(alertmanager_alert_duration_seconds_bucket[5m]))",
|
|
"legendFormat": "p99 duration"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 14,
|
|
"title": "Processing Time",
|
|
"type": "graph",
|
|
"gridPos": {"x": 0, "y": 36, "w": 12, "h": 8},
|
|
"targets": [
|
|
{
|
|
"expr": "rate(alertmanager_receiver_processing_duration_seconds_sum[5m]) / rate(alertmanager_receiver_processing_duration_seconds_count[5m])",
|
|
"legendFormat": "{{receiver}}"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 15,
|
|
"title": "Memory Usage",
|
|
"type": "stat",
|
|
"gridPos": {"x": 12, "y": 36, "w": 12, "h": 8},
|
|
"targets": [
|
|
{
|
|
"expr": "process_resident_memory_bytes{job=\"alertmanager\"} / 1024 / 1024",
|
|
"legendFormat": "{{instance}} - MB"
|
|
}
|
|
]
|
|
}
|
|
]
|
|
}
|
|
}
|
|
|
|
business-metrics-dashboard.json: |
|
|
{
|
|
"dashboard": {
|
|
"title": "Bakery IA - Business Metrics & KPIs",
|
|
"tags": ["bakery-ia", "business-metrics", "kpis"],
|
|
"timezone": "browser",
|
|
"refresh": "30s",
|
|
"schemaVersion": 16,
|
|
"version": 1,
|
|
"panels": [
|
|
{
|
|
"id": 1,
|
|
"title": "Requests per Service (Rate)",
|
|
"type": "graph",
|
|
"gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
|
|
"targets": [
|
|
{
|
|
"expr": "sum by (service) (rate(http_requests_total[5m]))",
|
|
"legendFormat": "{{service}}"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 2,
|
|
"title": "Total Request Rate",
|
|
"type": "stat",
|
|
"gridPos": {"x": 12, "y": 0, "w": 6, "h": 4},
|
|
"targets": [
|
|
{
|
|
"expr": "sum(rate(http_requests_total[5m]))",
|
|
"legendFormat": "requests/sec"
|
|
}
|
|
]
|
|
},
|
|
{
  "id": 3,
  "title": "Peak Request Rate (5m)",
  "type": "stat",
  "gridPos": {"x": 18, "y": 0, "w": 6, "h": 4},
  "targets": [
    {
      "expr": "max_over_time(sum(rate(http_requests_total[5m]))[$__range:])",
      "legendFormat": "Peak requests/sec"
    }
  ]
},
|
|
{
|
|
"id": 4,
|
|
"title": "Error Rates by Service",
|
|
"type": "graph",
|
|
"gridPos": {"x": 0, "y": 8, "w": 12, "h": 8},
|
|
"targets": [
|
|
{
|
|
"expr": "sum by (service) (rate(http_requests_total{status_code=~\"5..\"}[5m]))",
|
|
"legendFormat": "{{service}}"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 5,
|
|
"title": "Overall Error Rate",
|
|
"type": "stat",
|
|
"gridPos": {"x": 12, "y": 8, "w": 6, "h": 4},
|
|
"targets": [
|
|
{
|
|
"expr": "100 * (sum(rate(http_requests_total{status_code=~\"5..\"}[5m])) / sum(rate(http_requests_total[5m])))",
|
|
"legendFormat": "Error %"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 6,
|
|
"title": "4xx Error Rate",
|
|
"type": "stat",
|
|
"gridPos": {"x": 18, "y": 8, "w": 6, "h": 4},
|
|
"targets": [
|
|
{
|
|
"expr": "100 * (sum(rate(http_requests_total{status_code=~\"4..\"}[5m])) / sum(rate(http_requests_total[5m])))",
|
|
"legendFormat": "4xx %"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 7,
|
|
"title": "P95 Latency by Service (ms)",
|
|
"type": "graph",
|
|
"gridPos": {"x": 0, "y": 16, "w": 12, "h": 8},
|
|
"targets": [
|
|
{
|
|
"expr": "histogram_quantile(0.95, sum by (service, le) (rate(http_request_duration_seconds_bucket[5m]))) * 1000",
|
|
"legendFormat": "{{service}} p95"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 8,
|
|
"title": "P99 Latency by Service (ms)",
|
|
"type": "graph",
|
|
"gridPos": {"x": 12, "y": 16, "w": 12, "h": 8},
|
|
"targets": [
|
|
{
|
|
"expr": "histogram_quantile(0.99, sum by (service, le) (rate(http_request_duration_seconds_bucket[5m]))) * 1000",
|
|
"legendFormat": "{{service}} p99"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 9,
|
|
"title": "Average Latency (ms)",
|
|
"type": "stat",
|
|
"gridPos": {"x": 0, "y": 24, "w": 6, "h": 4},
|
|
"targets": [
|
|
{
|
|
"expr": "(sum(rate(http_request_duration_seconds_sum[5m])) / sum(rate(http_request_duration_seconds_count[5m]))) * 1000",
|
|
"legendFormat": "Avg latency ms"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 10,
|
|
"title": "Active Tenants",
|
|
"type": "stat",
|
|
"gridPos": {"x": 6, "y": 24, "w": 6, "h": 4},
|
|
"targets": [
|
|
{
|
|
"expr": "count(count by (tenant_id) (rate(http_requests_total[5m])))",
|
|
"legendFormat": "Active tenants"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 11,
|
|
"title": "Requests per Tenant",
|
|
"type": "stat",
|
|
"gridPos": {"x": 12, "y": 24, "w": 12, "h": 4},
|
|
"targets": [
|
|
{
|
|
"expr": "sum by (tenant_id) (rate(http_requests_total[5m]))",
|
|
"legendFormat": "Tenant {{tenant_id}}"
|
|
}
|
|
]
|
|
},
|
|
{
  "id": 12,
  "title": "Alert Generation Rate (per minute)",
  "type": "graph",
  "gridPos": {"x": 0, "y": 32, "w": 12, "h": 8},
  "targets": [
    {
      "expr": "sum by (alertname) ((time() - ALERTS_FOR_STATE) < bool 60)",
      "legendFormat": "{{alertname}}"
    }
  ]
},
|
|
{
|
|
"id": 13,
|
|
"title": "Training Job Success Rate",
|
|
"type": "stat",
|
|
"gridPos": {"x": 12, "y": 32, "w": 12, "h": 8},
|
|
"targets": [
|
|
{
|
|
"expr": "100 * (sum(training_job_completed_total{status=\"success\"}) / sum(training_job_completed_total))",
|
|
"legendFormat": "Success rate %"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 14,
|
|
"title": "Training Jobs in Progress",
|
|
"type": "stat",
|
|
"gridPos": {"x": 0, "y": 40, "w": 6, "h": 4},
|
|
"targets": [
|
|
{
|
|
"expr": "count(training_job_in_progress)",
|
|
"legendFormat": "Jobs running"
|
|
}
|
|
]
|
|
},
|
|
{
  "id": 15,
  "title": "Training Job Completion Time (p95, minutes)",
  "type": "stat",
  "gridPos": {"x": 6, "y": 40, "w": 6, "h": 4},
  "targets": [
    {
      "expr": "histogram_quantile(0.95, sum by (le) (rate(training_job_duration_seconds_bucket[1h]))) / 60",
      "legendFormat": "p95 minutes"
    }
  ]
},
|
|
{
|
|
"id": 16,
|
|
"title": "Failed Training Jobs",
|
|
"type": "stat",
|
|
"gridPos": {"x": 12, "y": 40, "w": 6, "h": 4},
|
|
"targets": [
|
|
{
|
|
"expr": "sum(training_job_completed_total{status=\"failed\"})",
|
|
"legendFormat": "Failed jobs"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 17,
|
|
"title": "Total Training Jobs Completed",
|
|
"type": "stat",
|
|
"gridPos": {"x": 18, "y": 40, "w": 6, "h": 4},
|
|
"targets": [
|
|
{
|
|
"expr": "sum(training_job_completed_total)",
|
|
"legendFormat": "Total completed"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 18,
|
|
"title": "API Health Status",
|
|
"type": "table",
|
|
"gridPos": {"x": 0, "y": 48, "w": 12, "h": 8},
|
|
"targets": [
|
|
{
|
|
"expr": "up{job=\"bakery-services\"}",
|
|
"format": "table",
|
|
"instant": true
|
|
}
|
|
],
|
|
"transformations": [
|
|
{
|
|
"id": "organize",
|
|
"options": {
|
|
"excludeByName": {},
|
|
"indexByName": {},
|
|
"renameByName": {
|
|
"service": "Service",
|
|
"Value": "Status",
|
|
"instance": "Instance"
|
|
}
|
|
}
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 19,
|
|
"title": "Service Success Rate (%)",
|
|
"type": "graph",
|
|
"gridPos": {"x": 12, "y": 48, "w": 12, "h": 8},
|
|
"targets": [
|
|
{
|
|
"expr": "100 * (1 - (sum by (service) (rate(http_requests_total{status_code=~\"5..\"}[5m])) / sum by (service) (rate(http_requests_total[5m]))))",
|
|
"legendFormat": "{{service}}"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 20,
|
|
"title": "Requests Processed Today",
|
|
"type": "stat",
|
|
"gridPos": {"x": 0, "y": 56, "w": 12, "h": 4},
|
|
"targets": [
|
|
{
|
|
"expr": "sum(increase(http_requests_total[24h]))",
|
|
"legendFormat": "Requests (24h)"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"id": 21,
|
|
"title": "Distinct Users Today",
|
|
"type": "stat",
|
|
"gridPos": {"x": 12, "y": 56, "w": 12, "h": 4},
|
|
"targets": [
|
|
{
|
|
"expr": "count(count by (user_id) (increase(http_requests_total{user_id!=\"\"}[24h])))",
|
|
"legendFormat": "Users (24h)"
|
|
}
|
|
]
|
|
}
|
|
]
|
|
}
|
|
}
|