---
# ConfigMap bundling four Grafana dashboard definitions for the Bakery IA
# monitoring stack. Each data key holds one dashboard as a JSON document in a
# literal block scalar; the panel "expr" fields are PromQL queries.
#
# NOTE(review): every document keeps its outer {"dashboard": ...} wrapper.
# Confirm the consumer expects that envelope; Grafana file provisioning
# normally wants the bare dashboard model instead.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboards-extended
  namespace: monitoring
data:
  # PostgreSQL dashboard: connections, transactions, cache hit ratio, slow
  # queries, dead tuples/bloat, replication lag, database size, locks and
  # query latency.
  # NOTE(review): panel 6 filters on the `duration_ms` label with a regex
  # (values >= 30000) because PromQL label matchers cannot compare numbers;
  # this assumes the label is a plain non-negative decimal - confirm.
  # NOTE(review): panel 8 uses `avg_tuple_size` and `pg_total_relation_size`,
  # which look like custom metrics - confirm an exporter query provides them.
  postgresql-dashboard.json: |
    {
      "dashboard": {
        "title": "Bakery IA - PostgreSQL Database",
        "tags": ["bakery-ia", "postgresql", "database"],
        "timezone": "browser",
        "refresh": "30s",
        "schemaVersion": 16,
        "version": 1,
        "panels": [
          {
            "id": 1,
            "title": "Active Connections by Database",
            "type": "graph",
            "gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "pg_stat_activity_count{state=\"active\"}",
                "legendFormat": "{{datname}} - active"
              },
              {
                "expr": "pg_stat_activity_count{state=\"idle\"}",
                "legendFormat": "{{datname}} - idle"
              },
              {
                "expr": "pg_stat_activity_count{state=\"idle in transaction\"}",
                "legendFormat": "{{datname}} - idle tx"
              }
            ]
          },
          {
            "id": 2,
            "title": "Total Connections",
            "type": "stat",
            "gridPos": {"x": 12, "y": 0, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "sum(pg_stat_activity_count)",
                "legendFormat": "Total connections"
              }
            ]
          },
          {
            "id": 3,
            "title": "Max Connections",
            "type": "stat",
            "gridPos": {"x": 18, "y": 0, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "pg_settings_max_connections",
                "legendFormat": "Max connections"
              }
            ]
          },
          {
            "id": 4,
            "title": "Transaction Rate (Commits vs Rollbacks)",
            "type": "graph",
            "gridPos": {"x": 0, "y": 8, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "rate(pg_stat_database_xact_commit[5m])",
                "legendFormat": "{{datname}} - commits"
              },
              {
                "expr": "rate(pg_stat_database_xact_rollback[5m])",
                "legendFormat": "{{datname}} - rollbacks"
              }
            ]
          },
          {
            "id": 5,
            "title": "Cache Hit Ratio",
            "type": "graph",
            "gridPos": {"x": 12, "y": 8, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "100 * (1 - (sum(rate(pg_stat_io_blocks_read_total[5m])) / (sum(rate(pg_stat_io_blocks_read_total[5m])) + sum(rate(pg_stat_io_blocks_hit_total[5m])))))",
                "legendFormat": "Cache hit ratio %"
              }
            ]
          },
          {
            "id": 6,
            "title": "Slow Queries (> 30s)",
            "type": "table",
            "gridPos": {"x": 0, "y": 16, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "pg_slow_queries{duration_ms=~\"([3-9][0-9]{4}|[0-9]{6,})([.][0-9]+)?\"}",
                "format": "table",
                "instant": true
              }
            ],
            "transformations": [
              {
                "id": "organize",
                "options": {
                  "excludeByName": {},
                  "indexByName": {},
                  "renameByName": {
                    "query": "Query",
                    "duration_ms": "Duration (ms)",
                    "datname": "Database"
                  }
                }
              }
            ]
          },
          {
            "id": 7,
            "title": "Dead Tuples by Table",
            "type": "graph",
            "gridPos": {"x": 12, "y": 16, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "pg_stat_user_tables_n_dead_tup",
                "legendFormat": "{{schemaname}}.{{relname}}"
              }
            ]
          },
          {
            "id": 8,
            "title": "Table Bloat Estimate",
            "type": "graph",
            "gridPos": {"x": 0, "y": 24, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "100 * (pg_stat_user_tables_n_dead_tup * avg_tuple_size) / (pg_total_relation_size * 8192)",
                "legendFormat": "{{schemaname}}.{{relname}} bloat %"
              }
            ]
          },
          {
            "id": 9,
            "title": "Replication Lag (bytes)",
            "type": "graph",
            "gridPos": {"x": 12, "y": 24, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "pg_replication_lag_bytes",
                "legendFormat": "{{slot_name}} - {{application_name}}"
              }
            ]
          },
          {
            "id": 10,
            "title": "Database Size (GB)",
            "type": "graph",
            "gridPos": {"x": 0, "y": 32, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "pg_database_size_bytes / 1024 / 1024 / 1024",
                "legendFormat": "{{datname}}"
              }
            ]
          },
          {
            "id": 11,
            "title": "Database Size Growth (per hour)",
            "type": "graph",
            "gridPos": {"x": 12, "y": 32, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "delta(pg_database_size_bytes[1h])",
                "legendFormat": "{{datname}} - bytes/hour"
              }
            ]
          },
          {
            "id": 12,
            "title": "Lock Counts by Type",
            "type": "graph",
            "gridPos": {"x": 0, "y": 40, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "pg_locks_count",
                "legendFormat": "{{datname}} - {{locktype}} - {{mode}}"
              }
            ]
          },
          {
            "id": 13,
            "title": "Query Duration (p95)",
            "type": "graph",
            "gridPos": {"x": 12, "y": 40, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "histogram_quantile(0.95, rate(pg_query_duration_seconds_bucket[5m]))",
                "legendFormat": "p95"
              }
            ]
          }
        ]
      }
    }

  # Node exporter dashboard: CPU, load, memory, disk I/O, network,
  # filesystem, uptime, context switches and interrupts per instance.
  # Panel 17 reports seconds since boot (time() - node_boot_time_seconds).
  node-exporter-dashboard.json: |
    {
      "dashboard": {
        "title": "Bakery IA - Node Exporter Infrastructure",
        "tags": ["bakery-ia", "node-exporter", "infrastructure"],
        "timezone": "browser",
        "refresh": "15s",
        "schemaVersion": 16,
        "version": 1,
        "panels": [
          {
            "id": 1,
            "title": "CPU Usage by Node",
            "type": "graph",
            "gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "100 - (avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
                "legendFormat": "{{instance}}"
              }
            ]
          },
          {
            "id": 2,
            "title": "Average CPU Usage",
            "type": "stat",
            "gridPos": {"x": 12, "y": 0, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
                "legendFormat": "Average CPU %"
              }
            ]
          },
          {
            "id": 3,
            "title": "CPU Load (1m, 5m, 15m)",
            "type": "stat",
            "gridPos": {"x": 18, "y": 0, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "avg(node_load1)",
                "legendFormat": "1m"
              },
              {
                "expr": "avg(node_load5)",
                "legendFormat": "5m"
              },
              {
                "expr": "avg(node_load15)",
                "legendFormat": "15m"
              }
            ]
          },
          {
            "id": 4,
            "title": "Memory Usage by Node",
            "type": "graph",
            "gridPos": {"x": 0, "y": 8, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
                "legendFormat": "{{instance}}"
              }
            ]
          },
          {
            "id": 5,
            "title": "Memory Used (GB)",
            "type": "stat",
            "gridPos": {"x": 12, "y": 8, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / 1024 / 1024 / 1024",
                "legendFormat": "{{instance}}"
              }
            ]
          },
          {
            "id": 6,
            "title": "Memory Available (GB)",
            "type": "stat",
            "gridPos": {"x": 18, "y": 8, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "node_memory_MemAvailable_bytes / 1024 / 1024 / 1024",
                "legendFormat": "{{instance}}"
              }
            ]
          },
          {
            "id": 7,
            "title": "Disk I/O Read Rate (MB/s)",
            "type": "graph",
            "gridPos": {"x": 0, "y": 16, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "rate(node_disk_read_bytes_total[5m]) / 1024 / 1024",
                "legendFormat": "{{instance}} - {{device}}"
              }
            ]
          },
          {
            "id": 8,
            "title": "Disk I/O Write Rate (MB/s)",
            "type": "graph",
            "gridPos": {"x": 12, "y": 16, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "rate(node_disk_written_bytes_total[5m]) / 1024 / 1024",
                "legendFormat": "{{instance}} - {{device}}"
              }
            ]
          },
          {
            "id": 9,
            "title": "Disk I/O Operations (IOPS)",
            "type": "graph",
            "gridPos": {"x": 0, "y": 24, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "rate(node_disk_reads_completed_total[5m]) + rate(node_disk_writes_completed_total[5m])",
                "legendFormat": "{{instance}} - {{device}}"
              }
            ]
          },
          {
            "id": 10,
            "title": "Network Receive Rate (Mbps)",
            "type": "graph",
            "gridPos": {"x": 12, "y": 24, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "rate(node_network_receive_bytes_total{device!=\"lo\"}[5m]) * 8 / 1024 / 1024",
                "legendFormat": "{{instance}} - {{device}}"
              }
            ]
          },
          {
            "id": 11,
            "title": "Network Transmit Rate (Mbps)",
            "type": "graph",
            "gridPos": {"x": 0, "y": 32, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "rate(node_network_transmit_bytes_total{device!=\"lo\"}[5m]) * 8 / 1024 / 1024",
                "legendFormat": "{{instance}} - {{device}}"
              }
            ]
          },
          {
            "id": 12,
            "title": "Network Errors",
            "type": "graph",
            "gridPos": {"x": 12, "y": 32, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "rate(node_network_receive_errs_total[5m]) + rate(node_network_transmit_errs_total[5m])",
                "legendFormat": "{{instance}} - {{device}}"
              }
            ]
          },
          {
            "id": 13,
            "title": "Filesystem Usage by Mount",
            "type": "graph",
            "gridPos": {"x": 0, "y": 40, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "100 * (1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes))",
                "legendFormat": "{{instance}} - {{mountpoint}}"
              }
            ]
          },
          {
            "id": 14,
            "title": "Filesystem Available (GB)",
            "type": "stat",
            "gridPos": {"x": 12, "y": 40, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "node_filesystem_avail_bytes / 1024 / 1024 / 1024",
                "legendFormat": "{{instance}} - {{mountpoint}}"
              }
            ]
          },
          {
            "id": 15,
            "title": "Filesystem Size (GB)",
            "type": "stat",
            "gridPos": {"x": 18, "y": 40, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "node_filesystem_size_bytes / 1024 / 1024 / 1024",
                "legendFormat": "{{instance}} - {{mountpoint}}"
              }
            ]
          },
          {
            "id": 16,
            "title": "Load Average (1m, 5m, 15m)",
            "type": "graph",
            "gridPos": {"x": 0, "y": 48, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "node_load1",
                "legendFormat": "{{instance}} - 1m"
              },
              {
                "expr": "node_load5",
                "legendFormat": "{{instance}} - 5m"
              },
              {
                "expr": "node_load15",
                "legendFormat": "{{instance}} - 15m"
              }
            ]
          },
          {
            "id": 17,
            "title": "System Up Time",
            "type": "stat",
            "gridPos": {"x": 12, "y": 48, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "time() - node_boot_time_seconds",
                "legendFormat": "{{instance}} - uptime"
              }
            ]
          },
          {
            "id": 18,
            "title": "Context Switches",
            "type": "graph",
            "gridPos": {"x": 0, "y": 56, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "rate(node_context_switches_total[5m])",
                "legendFormat": "{{instance}}"
              }
            ]
          },
          {
            "id": 19,
            "title": "Interrupts",
            "type": "graph",
            "gridPos": {"x": 12, "y": 56, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "rate(node_intr_total[5m])",
                "legendFormat": "{{instance}}"
              }
            ]
          }
        ]
      }
    }

  # Alertmanager dashboard: firing alerts (from the Prometheus ALERTS
  # series), alert intake, notification outcomes, suppressed alerts, cluster
  # membership and process memory.
  # Panels 4-5 multiply a per-second rate by 60 to match their
  # "per minute" titles.
  # NOTE(review): panel 8 counts alerts in state "suppressed", which is
  # presumably silenced plus inhibited alerts - confirm that is acceptable
  # for a "Silenced Alerts" tile.
  # NOTE(review): panels 13-14 reference
  # alertmanager_alert_duration_seconds and
  # alertmanager_receiver_processing_duration_seconds, which do not appear to
  # be upstream Alertmanager metrics - confirm something exports them.
  alertmanager-dashboard.json: |
    {
      "dashboard": {
        "title": "Bakery IA - AlertManager Monitoring",
        "tags": ["bakery-ia", "alertmanager", "alerting"],
        "timezone": "browser",
        "refresh": "10s",
        "schemaVersion": 16,
        "version": 1,
        "panels": [
          {
            "id": 1,
            "title": "Active Alerts by Severity",
            "type": "graph",
            "gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "count by (severity) (ALERTS{alertstate=\"firing\"})",
                "legendFormat": "{{severity}}"
              }
            ]
          },
          {
            "id": 2,
            "title": "Total Active Alerts",
            "type": "stat",
            "gridPos": {"x": 12, "y": 0, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "count(ALERTS{alertstate=\"firing\"})",
                "legendFormat": "Active alerts"
              }
            ]
          },
          {
            "id": 3,
            "title": "Critical Alerts",
            "type": "stat",
            "gridPos": {"x": 18, "y": 0, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "count(ALERTS{alertstate=\"firing\", severity=\"critical\"})",
                "legendFormat": "Critical"
              }
            ]
          },
          {
            "id": 4,
            "title": "Alert Firing Rate (per minute)",
            "type": "graph",
            "gridPos": {"x": 0, "y": 8, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "sum(rate(alertmanager_alerts_received_total{status=\"firing\"}[5m])) * 60",
                "legendFormat": "Alerts fired/min"
              }
            ]
          },
          {
            "id": 5,
            "title": "Alert Resolution Rate (per minute)",
            "type": "graph",
            "gridPos": {"x": 12, "y": 8, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "sum(rate(alertmanager_alerts_received_total{status=\"resolved\"}[5m])) * 60",
                "legendFormat": "Alerts resolved/min"
              }
            ]
          },
          {
            "id": 6,
            "title": "Notification Success Rate",
            "type": "graph",
            "gridPos": {"x": 0, "y": 16, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "100 * (1 - (sum(rate(alertmanager_notifications_failed_total[5m])) / sum(rate(alertmanager_notifications_total[5m]))))",
                "legendFormat": "Success rate %"
              }
            ]
          },
          {
            "id": 7,
            "title": "Notification Failures",
            "type": "graph",
            "gridPos": {"x": 12, "y": 16, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "sum by (integration) (rate(alertmanager_notifications_failed_total[5m]))",
                "legendFormat": "{{integration}}"
              }
            ]
          },
          {
            "id": 8,
            "title": "Silenced Alerts",
            "type": "stat",
            "gridPos": {"x": 0, "y": 24, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "sum(alertmanager_alerts{state=\"suppressed\"})",
                "legendFormat": "Silenced"
              }
            ]
          },
          {
            "id": 9,
            "title": "AlertManager Cluster Size",
            "type": "stat",
            "gridPos": {"x": 6, "y": 24, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "max(alertmanager_cluster_members)",
                "legendFormat": "Cluster peers"
              }
            ]
          },
          {
            "id": 10,
            "title": "AlertManager Peers",
            "type": "stat",
            "gridPos": {"x": 12, "y": 24, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "alertmanager_cluster_members",
                "legendFormat": "{{instance}}"
              }
            ]
          },
          {
            "id": 11,
            "title": "Cluster Status",
            "type": "stat",
            "gridPos": {"x": 18, "y": 24, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "up{job=\"alertmanager\"}",
                "legendFormat": "{{instance}}"
              }
            ]
          },
          {
            "id": 12,
            "title": "Alerts by Group",
            "type": "table",
            "gridPos": {"x": 0, "y": 28, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "count by (alertname) (ALERTS{alertstate=\"firing\"})",
                "format": "table",
                "instant": true
              }
            ],
            "transformations": [
              {
                "id": "organize",
                "options": {
                  "excludeByName": {},
                  "indexByName": {},
                  "renameByName": {
                    "alertname": "Alert Name",
                    "Value": "Count"
                  }
                }
              }
            ]
          },
          {
            "id": 13,
            "title": "Alert Duration (p99)",
            "type": "graph",
            "gridPos": {"x": 12, "y": 28, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "histogram_quantile(0.99, rate(alertmanager_alert_duration_seconds_bucket[5m]))",
                "legendFormat": "p99 duration"
              }
            ]
          },
          {
            "id": 14,
            "title": "Processing Time",
            "type": "graph",
            "gridPos": {"x": 0, "y": 36, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "rate(alertmanager_receiver_processing_duration_seconds_sum[5m]) / rate(alertmanager_receiver_processing_duration_seconds_count[5m])",
                "legendFormat": "{{receiver}}"
              }
            ]
          },
          {
            "id": 15,
            "title": "Memory Usage",
            "type": "stat",
            "gridPos": {"x": 12, "y": 36, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "process_resident_memory_bytes{job=\"alertmanager\"} / 1024 / 1024",
                "legendFormat": "{{instance}} - MB"
              }
            ]
          }
        ]
      }
    }

  # Business dashboard: HTTP request/error rates and latency per service,
  # tenant activity, alert generation, training-job outcomes and service
  # health.
  # Panel 3 takes the maximum of the 5m request rate over the dashboard's
  # selected time range ($__range).
  # Panel 12 counts alerts whose ALERTS_FOR_STATE value (treated as the
  # activation timestamp) lies within the last 60 seconds.
  # NOTE(review): panel 15 assumes training_job_duration_seconds is a classic
  # histogram exposing `_bucket` series; the 24h window is a guess for
  # infrequent jobs - confirm both.
  business-metrics-dashboard.json: |
    {
      "dashboard": {
        "title": "Bakery IA - Business Metrics & KPIs",
        "tags": ["bakery-ia", "business-metrics", "kpis"],
        "timezone": "browser",
        "refresh": "30s",
        "schemaVersion": 16,
        "version": 1,
        "panels": [
          {
            "id": 1,
            "title": "Requests per Service (Rate)",
            "type": "graph",
            "gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "sum by (service) (rate(http_requests_total[5m]))",
                "legendFormat": "{{service}}"
              }
            ]
          },
          {
            "id": 2,
            "title": "Total Request Rate",
            "type": "stat",
            "gridPos": {"x": 12, "y": 0, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "sum(rate(http_requests_total[5m]))",
                "legendFormat": "requests/sec"
              }
            ]
          },
          {
            "id": 3,
            "title": "Peak Request Rate (5m)",
            "type": "stat",
            "gridPos": {"x": 18, "y": 0, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "max_over_time(sum(rate(http_requests_total[5m]))[$__range:])",
                "legendFormat": "Peak requests/sec"
              }
            ]
          },
          {
            "id": 4,
            "title": "Error Rates by Service",
            "type": "graph",
            "gridPos": {"x": 0, "y": 8, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "sum by (service) (rate(http_requests_total{status_code=~\"5..\"}[5m]))",
                "legendFormat": "{{service}}"
              }
            ]
          },
          {
            "id": 5,
            "title": "Overall Error Rate",
            "type": "stat",
            "gridPos": {"x": 12, "y": 8, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "100 * (sum(rate(http_requests_total{status_code=~\"5..\"}[5m])) / sum(rate(http_requests_total[5m])))",
                "legendFormat": "Error %"
              }
            ]
          },
          {
            "id": 6,
            "title": "4xx Error Rate",
            "type": "stat",
            "gridPos": {"x": 18, "y": 8, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "100 * (sum(rate(http_requests_total{status_code=~\"4..\"}[5m])) / sum(rate(http_requests_total[5m])))",
                "legendFormat": "4xx %"
              }
            ]
          },
          {
            "id": 7,
            "title": "P95 Latency by Service (ms)",
            "type": "graph",
            "gridPos": {"x": 0, "y": 16, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "histogram_quantile(0.95, sum by (service, le) (rate(http_request_duration_seconds_bucket[5m]))) * 1000",
                "legendFormat": "{{service}} p95"
              }
            ]
          },
          {
            "id": 8,
            "title": "P99 Latency by Service (ms)",
            "type": "graph",
            "gridPos": {"x": 12, "y": 16, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "histogram_quantile(0.99, sum by (service, le) (rate(http_request_duration_seconds_bucket[5m]))) * 1000",
                "legendFormat": "{{service}} p99"
              }
            ]
          },
          {
            "id": 9,
            "title": "Average Latency (ms)",
            "type": "stat",
            "gridPos": {"x": 0, "y": 24, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "(sum(rate(http_request_duration_seconds_sum[5m])) / sum(rate(http_request_duration_seconds_count[5m]))) * 1000",
                "legendFormat": "Avg latency ms"
              }
            ]
          },
          {
            "id": 10,
            "title": "Active Tenants",
            "type": "stat",
            "gridPos": {"x": 6, "y": 24, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "count(count by (tenant_id) (rate(http_requests_total[5m])))",
                "legendFormat": "Active tenants"
              }
            ]
          },
          {
            "id": 11,
            "title": "Requests per Tenant",
            "type": "stat",
            "gridPos": {"x": 12, "y": 24, "w": 12, "h": 4},
            "targets": [
              {
                "expr": "sum by (tenant_id) (rate(http_requests_total[5m]))",
                "legendFormat": "Tenant {{tenant_id}}"
              }
            ]
          },
          {
            "id": 12,
            "title": "Alert Generation Rate (per minute)",
            "type": "graph",
            "gridPos": {"x": 0, "y": 32, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "count by (alertname) ((time() - ALERTS_FOR_STATE) < 60)",
                "legendFormat": "{{alertname}}"
              }
            ]
          },
          {
            "id": 13,
            "title": "Training Job Success Rate",
            "type": "stat",
            "gridPos": {"x": 12, "y": 32, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "100 * (sum(training_job_completed_total{status=\"success\"}) / sum(training_job_completed_total))",
                "legendFormat": "Success rate %"
              }
            ]
          },
          {
            "id": 14,
            "title": "Training Jobs in Progress",
            "type": "stat",
            "gridPos": {"x": 0, "y": 40, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "sum(training_job_in_progress)",
                "legendFormat": "Jobs running"
              }
            ]
          },
          {
            "id": 15,
            "title": "Training Job Completion Time (p95, minutes)",
            "type": "stat",
            "gridPos": {"x": 6, "y": 40, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "histogram_quantile(0.95, sum by (le) (rate(training_job_duration_seconds_bucket[24h]))) / 60",
                "legendFormat": "p95 minutes"
              }
            ]
          },
          {
            "id": 16,
            "title": "Failed Training Jobs",
            "type": "stat",
            "gridPos": {"x": 12, "y": 40, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "sum(training_job_completed_total{status=\"failed\"})",
                "legendFormat": "Failed jobs"
              }
            ]
          },
          {
            "id": 17,
            "title": "Total Training Jobs Completed",
            "type": "stat",
            "gridPos": {"x": 18, "y": 40, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "sum(training_job_completed_total)",
                "legendFormat": "Total completed"
              }
            ]
          },
          {
            "id": 18,
            "title": "API Health Status",
            "type": "table",
            "gridPos": {"x": 0, "y": 48, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "up{job=\"bakery-services\"}",
                "format": "table",
                "instant": true
              }
            ],
            "transformations": [
              {
                "id": "organize",
                "options": {
                  "excludeByName": {},
                  "indexByName": {},
                  "renameByName": {
                    "service": "Service",
                    "Value": "Status",
                    "instance": "Instance"
                  }
                }
              }
            ]
          },
          {
            "id": 19,
            "title": "Service Success Rate (%)",
            "type": "graph",
            "gridPos": {"x": 12, "y": 48, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "100 * (1 - (sum by (service) (rate(http_requests_total{status_code=~\"5..\"}[5m])) / sum by (service) (rate(http_requests_total[5m]))))",
                "legendFormat": "{{service}}"
              }
            ]
          },
          {
            "id": 20,
            "title": "Requests Processed Today",
            "type": "stat",
            "gridPos": {"x": 0, "y": 56, "w": 12, "h": 4},
            "targets": [
              {
                "expr": "sum(increase(http_requests_total[24h]))",
                "legendFormat": "Requests (24h)"
              }
            ]
          },
          {
            "id": 21,
            "title": "Distinct Users Today",
            "type": "stat",
            "gridPos": {"x": 12, "y": 56, "w": 12, "h": 4},
            "targets": [
              {
                "expr": "count(count by (user_id) (increase(http_requests_total{user_id!=\"\"}[24h])))",
                "legendFormat": "Users (24h)"
              }
            ]
          }
        ]
      }
    }