Add signoz
This commit is contained in:
@@ -48,6 +48,9 @@ spec:
|
||||
name: pos-integration-secrets
|
||||
- secretRef:
|
||||
name: whatsapp-secrets
|
||||
env:
|
||||
- name: OTEL_EXPORTER_OTLP_ENDPOINT
|
||||
value: "http://otel-collector.monitoring.svc.cluster.local:4317"
|
||||
resources:
|
||||
requests:
|
||||
memory: "256Mi"
|
||||
|
||||
@@ -1,429 +0,0 @@
|
||||
---
# Prometheus alerting rules for the bakery-ia platform.
# The single data key `alert-rules.yml` holds a Prometheus rule file; each
# group below is evaluated at its own `interval`. Every rule carries a
# `severity` and `component` label plus a `runbook_url` annotation.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-alert-rules
  namespace: monitoring
data:
  alert-rules.yml: |
    groups:
      # Basic Infrastructure Alerts
      - name: bakery_services
        interval: 30s
        rules:
          - alert: ServiceDown
            expr: up{job="bakery-services"} == 0
            for: 2m
            labels:
              severity: critical
              component: infrastructure
            annotations:
              summary: "Service {{ $labels.service }} is down"
              description: "Service {{ $labels.service }} in namespace {{ $labels.namespace }} has been down for more than 2 minutes."
              runbook_url: "https://runbooks.bakery-ia.local/ServiceDown"

          - alert: HighErrorRate
            # Ratio of 5xx responses to all responses, per service.
            expr: |
              (
                sum(rate(http_requests_total{status_code=~"5..", job="bakery-services"}[5m])) by (service)
                /
                sum(rate(http_requests_total{job="bakery-services"}[5m])) by (service)
              ) > 0.10
            for: 5m
            labels:
              severity: critical
              component: application
            annotations:
              summary: "High error rate on {{ $labels.service }}"
              description: "Service {{ $labels.service }} has error rate above 10% (current: {{ $value | humanizePercentage }})."
              runbook_url: "https://runbooks.bakery-ia.local/HighErrorRate"

          - alert: HighResponseTime
            expr: |
              histogram_quantile(0.95,
                sum(rate(http_request_duration_seconds_bucket{job="bakery-services"}[5m])) by (service, le)
              ) > 1
            for: 5m
            labels:
              severity: warning
              component: performance
            annotations:
              summary: "High response time on {{ $labels.service }}"
              description: "Service {{ $labels.service }} P95 latency is above 1 second (current: {{ $value }}s)."
              runbook_url: "https://runbooks.bakery-ia.local/HighResponseTime"

          - alert: HighMemoryUsage
            expr: |
              container_memory_usage_bytes{namespace="bakery-ia", container!=""} > 500000000
            for: 5m
            labels:
              severity: warning
              component: infrastructure
            annotations:
              summary: "High memory usage in {{ $labels.pod }}"
              description: "Container {{ $labels.container }} in pod {{ $labels.pod }} is using more than 500MB of memory (current: {{ $value | humanize }}B)."
              runbook_url: "https://runbooks.bakery-ia.local/HighMemoryUsage"

          - alert: DatabaseConnectionHigh
            expr: |
              pg_stat_database_numbackends{datname="bakery"} > 80
            for: 5m
            labels:
              severity: warning
              component: database
            annotations:
              summary: "High database connection count"
              description: "Database has more than 80 active connections (current: {{ $value }})."
              runbook_url: "https://runbooks.bakery-ia.local/DatabaseConnectionHigh"

      # Business Logic Alerts
      - name: bakery_business
        interval: 30s
        rules:
          - alert: TrainingJobFailed
            expr: |
              increase(training_job_failures_total[1h]) > 0
            for: 5m
            labels:
              severity: warning
              component: ml-training
            annotations:
              summary: "Training job failures detected"
              description: "{{ $value }} training job(s) failed in the last hour."
              runbook_url: "https://runbooks.bakery-ia.local/TrainingJobFailed"

          - alert: LowPredictionAccuracy
            expr: |
              prediction_model_accuracy < 0.70
            for: 15m
            labels:
              severity: warning
              component: ml-inference
            annotations:
              summary: "Model prediction accuracy is low"
              description: "Model {{ $labels.model_name }} accuracy is below 70% (current: {{ $value | humanizePercentage }})."
              runbook_url: "https://runbooks.bakery-ia.local/LowPredictionAccuracy"

          - alert: APIRateLimitHit
            expr: |
              increase(rate_limit_hits_total[5m]) > 10
            for: 5m
            labels:
              severity: info
              component: api-gateway
            annotations:
              summary: "API rate limits being hit frequently"
              description: "Rate limits hit {{ $value }} times in the last 5 minutes."
              runbook_url: "https://runbooks.bakery-ia.local/APIRateLimitHit"

      # Alert System Health
      - name: alert_system_health
        interval: 30s
        rules:
          - alert: AlertSystemComponentDown
            expr: |
              alert_system_component_health{component=~"processor|notifier|scheduler"} == 0
            for: 2m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "Alert system component {{ $labels.component }} is unhealthy"
              description: "Component {{ $labels.component }} has been unhealthy for more than 2 minutes."
              runbook_url: "https://runbooks.bakery-ia.local/AlertSystemComponentDown"

          - alert: RabbitMQConnectionDown
            expr: |
              rabbitmq_up == 0
            for: 1m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "RabbitMQ connection is down"
              description: "Alert system has lost connection to RabbitMQ message queue."
              runbook_url: "https://runbooks.bakery-ia.local/RabbitMQConnectionDown"

          - alert: RedisConnectionDown
            expr: |
              redis_up == 0
            for: 1m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "Redis connection is down"
              description: "Alert system has lost connection to Redis cache."
              runbook_url: "https://runbooks.bakery-ia.local/RedisConnectionDown"

          - alert: NoSchedulerLeader
            expr: |
              sum(alert_system_scheduler_leader) == 0
            for: 5m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "No alert scheduler leader elected"
              description: "No scheduler instance has been elected as leader for 5 minutes."
              runbook_url: "https://runbooks.bakery-ia.local/NoSchedulerLeader"

      # Alert System Performance
      - name: alert_system_performance
        interval: 30s
        rules:
          - alert: HighAlertProcessingErrorRate
            expr: |
              (
                sum(rate(alert_processing_errors_total[2m]))
                /
                sum(rate(alerts_processed_total[2m]))
              ) > 0.10
            for: 2m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "High alert processing error rate"
              description: "Alert processing error rate is above 10% (current: {{ $value | humanizePercentage }})."
              runbook_url: "https://runbooks.bakery-ia.local/HighAlertProcessingErrorRate"

          - alert: HighNotificationDeliveryFailureRate
            expr: |
              (
                sum(rate(notification_delivery_failures_total[3m]))
                /
                sum(rate(notifications_sent_total[3m]))
              ) > 0.05
            for: 3m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "High notification delivery failure rate"
              description: "Notification delivery failure rate is above 5% (current: {{ $value | humanizePercentage }})."
              runbook_url: "https://runbooks.bakery-ia.local/HighNotificationDeliveryFailureRate"

          - alert: HighAlertProcessingLatency
            expr: |
              histogram_quantile(0.95,
                sum(rate(alert_processing_duration_seconds_bucket[5m])) by (le)
              ) > 5
            for: 5m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "High alert processing latency"
              description: "P95 alert processing latency is above 5 seconds (current: {{ $value }}s)."
              runbook_url: "https://runbooks.bakery-ia.local/HighAlertProcessingLatency"

          - alert: TooManySSEConnections
            expr: |
              sse_active_connections > 1000
            for: 2m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "Too many active SSE connections"
              description: "More than 1000 active SSE connections (current: {{ $value }})."
              runbook_url: "https://runbooks.bakery-ia.local/TooManySSEConnections"

          - alert: SSEConnectionErrors
            expr: |
              rate(sse_connection_errors_total[3m]) > 0.5
            for: 3m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "High rate of SSE connection errors"
              description: "SSE connection error rate is {{ $value }} errors/sec."
              runbook_url: "https://runbooks.bakery-ia.local/SSEConnectionErrors"

      # Alert System Business Logic
      - name: alert_system_business
        interval: 30s
        rules:
          - alert: UnusuallyHighAlertVolume
            expr: |
              rate(alerts_generated_total[5m]) > 2
            for: 5m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "Unusually high alert generation volume"
              description: "More than 2 alerts per second being generated (current: {{ $value }}/sec)."
              runbook_url: "https://runbooks.bakery-ia.local/UnusuallyHighAlertVolume"

          - alert: NoAlertsGenerated
            expr: |
              rate(alerts_generated_total[30m]) == 0
            for: 15m
            labels:
              severity: info
              component: alert-system
            annotations:
              summary: "No alerts generated recently"
              description: "No alerts have been generated in the last 30 minutes. This might indicate a problem with alert detection."
              runbook_url: "https://runbooks.bakery-ia.local/NoAlertsGenerated"

          - alert: SlowAlertResponseTime
            expr: |
              histogram_quantile(0.95,
                sum(rate(alert_response_time_seconds_bucket[10m])) by (le)
              ) > 3600
            for: 10m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "Slow alert response times"
              description: "P95 alert response time is above 1 hour (current: {{ $value | humanizeDuration }})."
              runbook_url: "https://runbooks.bakery-ia.local/SlowAlertResponseTime"

          - alert: CriticalAlertsUnacknowledged
            expr: |
              sum(alerts_unacknowledged{severity="critical"}) > 5
            for: 10m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "Multiple critical alerts unacknowledged"
              description: "{{ $value }} critical alerts have not been acknowledged for 10+ minutes."
              runbook_url: "https://runbooks.bakery-ia.local/CriticalAlertsUnacknowledged"

      # Alert System Capacity
      - name: alert_system_capacity
        interval: 30s
        rules:
          - alert: LargeSSEMessageQueues
            expr: |
              sse_message_queue_size > 100
            for: 5m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "Large SSE message queues detected"
              description: "SSE message queue for tenant {{ $labels.tenant_id }} has {{ $value }} messages queued."
              runbook_url: "https://runbooks.bakery-ia.local/LargeSSEMessageQueues"

          - alert: SlowDatabaseStorage
            expr: |
              histogram_quantile(0.95,
                sum(rate(alert_storage_duration_seconds_bucket[5m])) by (le)
              ) > 1
            for: 5m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "Slow alert database storage"
              description: "P95 alert storage latency is above 1 second (current: {{ $value }}s)."
              runbook_url: "https://runbooks.bakery-ia.local/SlowDatabaseStorage"

      # Alert System Critical Scenarios
      - name: alert_system_critical
        interval: 15s
        rules:
          - alert: AlertSystemDown
            expr: |
              up{service=~"alert-processor|notification-service"} == 0
            for: 1m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "Alert system is completely down"
              description: "Core alert system service {{ $labels.service }} is down."
              runbook_url: "https://runbooks.bakery-ia.local/AlertSystemDown"

          - alert: AlertDataNotPersisted
            # Fires while the processing rate exceeds the storage rate.
            expr: |
              (
                sum(rate(alerts_processed_total[2m]))
                -
                sum(rate(alerts_stored_total[2m]))
              ) > 0
            for: 2m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "Alerts not being persisted to database"
              description: "Alerts are being processed but not stored in the database."
              runbook_url: "https://runbooks.bakery-ia.local/AlertDataNotPersisted"

          - alert: NotificationsNotDelivered
            # Fires while the processing rate exceeds the notification rate.
            expr: |
              (
                sum(rate(alerts_processed_total[3m]))
                -
                sum(rate(notifications_sent_total[3m]))
              ) > 0
            for: 3m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "Notifications not being delivered"
              description: "Alerts are being processed but notifications are not being sent."
              runbook_url: "https://runbooks.bakery-ia.local/NotificationsNotDelivered"

      # Monitoring System Self-Monitoring
      - name: monitoring_health
        interval: 30s
        rules:
          - alert: PrometheusDown
            expr: up{job="prometheus"} == 0
            for: 5m
            labels:
              severity: critical
              component: monitoring
            annotations:
              summary: "Prometheus is down"
              description: "Prometheus monitoring system is not responding."
              runbook_url: "https://runbooks.bakery-ia.local/PrometheusDown"

          - alert: AlertManagerDown
            expr: up{job="alertmanager"} == 0
            for: 2m
            labels:
              severity: critical
              component: monitoring
            annotations:
              summary: "AlertManager is down"
              description: "AlertManager is not responding. Alerts will not be routed."
              runbook_url: "https://runbooks.bakery-ia.local/AlertManagerDown"

          - alert: PrometheusStorageFull
            # Compare persisted block bytes against the configured size-based
            # retention limit. The previous expression divided block bytes by
            # (block bytes + WAL bytes), which only measures the blocks' share
            # of TSDB data and exceeds 0.90 on a healthy instance.
            # The `> 0` filter drops instances with no size limit configured
            # (limit reported as 0) so the division never yields +Inf.
            expr: |
              (
                prometheus_tsdb_storage_blocks_bytes
                /
                (prometheus_tsdb_retention_limit_bytes > 0)
              ) > 0.90
            for: 10m
            labels:
              severity: warning
              component: monitoring
            annotations:
              summary: "Prometheus storage almost full"
              description: "Prometheus TSDB blocks use {{ $value | humanizePercentage }} of the configured retention size."
              runbook_url: "https://runbooks.bakery-ia.local/PrometheusStorageFull"

          - alert: PrometheusScrapeErrors
            expr: |
              rate(prometheus_target_scrapes_exceeded_sample_limit_total[5m]) > 0
            for: 5m
            labels:
              severity: warning
              component: monitoring
            annotations:
              summary: "Prometheus scrape errors detected"
              description: "Prometheus is experiencing scrape errors for target {{ $labels.job }}."
              runbook_url: "https://runbooks.bakery-ia.local/PrometheusScrapeErrors"
|
||||
@@ -1,27 +0,0 @@
|
||||
---
# Init-container script that renders the AlertManager config.
# It reads the template at /etc/alertmanager-template/alertmanager.yml,
# replaces the `{{ .name }}` placeholders with values taken from environment
# variables, and writes the result to /etc/alertmanager-final/alertmanager.yml.
apiVersion: v1
kind: ConfigMap
metadata:
  name: alertmanager-init-script
  namespace: monitoring
data:
  init-config.sh: |
    #!/bin/sh
    set -e

    # Escape characters that are special in a sed replacement when `|` is
    # the delimiter (backslash, ampersand, pipe), so credentials containing
    # them are inserted literally instead of corrupting the output.
    escape_sed() {
      printf '%s' "$1" | sed -e 's/[\\&|]/\\&/g'
    }

    SMTP_HOST_ESC=$(escape_sed "${SMTP_HOST:-}")
    SMTP_FROM_ESC=$(escape_sed "${SMTP_FROM:-}")
    SMTP_USERNAME_ESC=$(escape_sed "${SMTP_USERNAME:-}")
    SMTP_PASSWORD_ESC=$(escape_sed "${SMTP_PASSWORD:-}")
    # SLACK_WEBHOOK_URL may be unset; it then renders as an empty string.
    SLACK_WEBHOOK_URL_ESC=$(escape_sed "${SLACK_WEBHOOK_URL:-}")

    # Substitute environment variables into the template config
    sed \
      -e "s|{{ .smtp_host }}|${SMTP_HOST_ESC}|g" \
      -e "s|{{ .smtp_from }}|${SMTP_FROM_ESC}|g" \
      -e "s|{{ .smtp_username }}|${SMTP_USERNAME_ESC}|g" \
      -e "s|{{ .smtp_password }}|${SMTP_PASSWORD_ESC}|g" \
      -e "s|{{ .slack_webhook_url }}|${SLACK_WEBHOOK_URL_ESC}|g" \
      /etc/alertmanager-template/alertmanager.yml \
      > /etc/alertmanager-final/alertmanager.yml

    # Do NOT print the rendered file: it contains the SMTP password and the
    # Slack webhook URL, which would end up in the container logs.
    echo "AlertManager config initialized successfully"
|
||||
@@ -1,391 +0,0 @@
|
||||
---
# AlertManager configuration template.
# The `{{ .smtp_* }}` / `{{ .slack_webhook_url }}` placeholders are replaced
# by the init-config.sh script before AlertManager starts; every other
# `{{ ... }}` expression is an AlertManager notification template.
apiVersion: v1
kind: ConfigMap
metadata:
  name: alertmanager-config
  namespace: monitoring
data:
  alertmanager.yml: |
    global:
      resolve_timeout: 5m
      smtp_smarthost: '{{ .smtp_host }}'
      smtp_from: '{{ .smtp_from }}'
      smtp_auth_username: '{{ .smtp_username }}'
      smtp_auth_password: '{{ .smtp_password }}'
      smtp_require_tls: true

    # Define notification templates
    templates:
      - '/etc/alertmanager/templates/*.tmpl'

    # Route alerts to appropriate receivers
    route:
      # Default receiver
      receiver: 'default-email'
      # Group alerts by these labels
      group_by: ['alertname', 'cluster', 'service']
      # Wait time before sending initial notification
      group_wait: 10s
      # Wait time before sending notifications about new alerts in the group
      group_interval: 10s
      # Wait time before re-sending a notification
      repeat_interval: 12h

      # Child routes for specific alert routing.
      # Routes are evaluated in order and evaluation stops at the first match
      # unless that route sets `continue: true`.
      routes:
        # Critical alerts - send immediately to all channels
        - match:
            severity: critical
          receiver: 'critical-alerts'
          group_wait: 0s
          group_interval: 5m
          repeat_interval: 4h
          continue: true

        # Warning alerts - less urgent
        - match:
            severity: warning
          receiver: 'warning-alerts'
          group_wait: 30s
          group_interval: 5m
          repeat_interval: 12h
          # Keep evaluating: without this, warning-severity alerts never
          # reached the team-specific routes below.
          continue: true

        # Alert system specific alerts
        - match:
            component: alert-system
          receiver: 'alert-system-team'
          group_wait: 10s
          repeat_interval: 6h
          # Keep evaluating so SlowDatabaseStorage (component alert-system)
          # also reaches the database team.
          continue: true

        # Database alerts
        - match_re:
            alertname: ^(DatabaseConnectionHigh|SlowDatabaseStorage)$
          receiver: 'database-team'
          group_wait: 30s
          repeat_interval: 8h

        # Infrastructure alerts
        - match_re:
            alertname: ^(HighMemoryUsage|ServiceDown)$
          receiver: 'infra-team'
          group_wait: 30s
          repeat_interval: 6h

    # Inhibition rules - prevent alert spam
    inhibit_rules:
      # If service is down, inhibit all other alerts for that service
      - source_match:
          alertname: 'ServiceDown'
        target_match_re:
          alertname: '(HighErrorRate|HighResponseTime|HighMemoryUsage)'
        equal: ['service']

      # If AlertSystem is completely down, inhibit component alerts
      - source_match:
          alertname: 'AlertSystemDown'
        target_match_re:
          alertname: 'AlertSystemComponent.*'
        equal: ['namespace']

      # If RabbitMQ is down, inhibit alert processing errors
      - source_match:
          alertname: 'RabbitMQConnectionDown'
        target_match:
          alertname: 'HighAlertProcessingErrorRate'
        equal: ['namespace']

    # Receivers - notification destinations
    receivers:
      # Default email receiver
      - name: 'default-email'
        email_configs:
          - to: 'alerts@yourdomain.com'
            headers:
              Subject: '[{{ .Status | toUpper }}] {{ .GroupLabels.alertname }} - {{ .GroupLabels.service }}'
            html: |
              {{ range .Alerts }}
              <h2>{{ .Labels.alertname }}</h2>
              <p><strong>Status:</strong> {{ .Status }}</p>
              <p><strong>Severity:</strong> {{ .Labels.severity }}</p>
              <p><strong>Service:</strong> {{ .Labels.service }}</p>
              <p><strong>Summary:</strong> {{ .Annotations.summary }}</p>
              <p><strong>Description:</strong> {{ .Annotations.description }}</p>
              <p><strong>Started:</strong> {{ .StartsAt }}</p>
              {{ if eq .Status "resolved" }}<p><strong>Ended:</strong> {{ .EndsAt }}</p>{{ end }}
              {{ end }}

      # Critical alerts - multiple channels
      - name: 'critical-alerts'
        email_configs:
          - to: 'critical-alerts@yourdomain.com,oncall@yourdomain.com'
            headers:
              Subject: '🚨 [CRITICAL] {{ .GroupLabels.alertname }} - {{ .GroupLabels.service }}'
            send_resolved: true
        # Uncomment to enable Slack notifications
        # slack_configs:
        #   - api_url: '{{ .slack_webhook_url }}'
        #     channel: '#alerts-critical'
        #     title: '🚨 Critical Alert'
        #     text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
        #     send_resolved: true

      # Warning alerts
      - name: 'warning-alerts'
        email_configs:
          - to: 'alerts@yourdomain.com'
            headers:
              Subject: '⚠️ [WARNING] {{ .GroupLabels.alertname }} - {{ .GroupLabels.service }}'
            send_resolved: true

      # Alert system team
      - name: 'alert-system-team'
        email_configs:
          - to: 'alert-system-team@yourdomain.com'
            headers:
              Subject: '[Alert System] {{ .GroupLabels.alertname }}'
            send_resolved: true

      # Database team
      - name: 'database-team'
        email_configs:
          - to: 'database-team@yourdomain.com'
            headers:
              Subject: '[Database] {{ .GroupLabels.alertname }}'
            send_resolved: true

      # Infrastructure team
      - name: 'infra-team'
        email_configs:
          - to: 'infra-team@yourdomain.com'
            headers:
              Subject: '[Infrastructure] {{ .GroupLabels.alertname }}'
            send_resolved: true
|
||||
|
||||
---
# Notification templates loaded by AlertManager from
# /etc/alertmanager/templates/*.tmpl. Defines the "cluster",
# "slack.default.title" and "slack.default.text" named templates.
kind: ConfigMap
apiVersion: v1
metadata:
  namespace: monitoring
  name: alertmanager-templates
data:
  default.tmpl: |
    {{ define "cluster" }}{{ .ExternalURL | reReplaceAll ".*alertmanager\\.(.*)" "$1" }}{{ end }}

    {{ define "slack.default.title" }}
    [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.alertname }}
    {{ end }}

    {{ define "slack.default.text" }}
    {{ range .Alerts }}
    *Alert:* {{ .Annotations.summary }}
    *Description:* {{ .Annotations.description }}
    *Severity:* `{{ .Labels.severity }}`
    *Service:* `{{ .Labels.service }}`
    {{ end }}
    {{ end }}
|
||||
|
||||
---
# Three-replica AlertManager cluster.
# An init container renders the config template (with SMTP/Slack secrets)
# into the shared `config-final` emptyDir; the main container reads it from
# /etc/alertmanager, and a reload sidecar triggers /-/reload on file changes.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: alertmanager
  namespace: monitoring
  labels:
    app: alertmanager
spec:
  serviceName: alertmanager
  replicas: 3
  selector:
    matchLabels:
      app: alertmanager
  template:
    metadata:
      labels:
        app: alertmanager
    spec:
      serviceAccountName: prometheus
      # Make the `storage` PVC group-writable for the AlertManager process.
      # NOTE(review): assumes the prom/alertmanager image runs as
      # nobody (65534) - confirm against the image in use.
      securityContext:
        fsGroup: 65534
      initContainers:
        - name: init-config
          image: busybox:1.36
          command: ['/bin/sh', '/scripts/init-config.sh']
          env:
            - name: SMTP_HOST
              valueFrom:
                secretKeyRef:
                  name: alertmanager-secrets
                  key: smtp-host
            - name: SMTP_USERNAME
              valueFrom:
                secretKeyRef:
                  name: alertmanager-secrets
                  key: smtp-username
            - name: SMTP_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: alertmanager-secrets
                  key: smtp-password
            - name: SMTP_FROM
              valueFrom:
                secretKeyRef:
                  name: alertmanager-secrets
                  key: smtp-from
            # Slack is optional: the pod still starts when the key is absent.
            - name: SLACK_WEBHOOK_URL
              valueFrom:
                secretKeyRef:
                  name: alertmanager-secrets
                  key: slack-webhook-url
                  optional: true
          volumeMounts:
            - name: init-script
              mountPath: /scripts
            - name: config-template
              mountPath: /etc/alertmanager-template
            - name: config-final
              mountPath: /etc/alertmanager-final
      affinity:
        # Prefer spreading replicas across nodes.
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values:
                        - alertmanager
                topologyKey: kubernetes.io/hostname
      containers:
        - name: alertmanager
          image: prom/alertmanager:v0.27.0
          args:
            - '--config.file=/etc/alertmanager/alertmanager.yml'
            - '--storage.path=/alertmanager'
            - '--cluster.listen-address=0.0.0.0:9094'
            - '--cluster.peer=alertmanager-0.alertmanager.monitoring.svc.cluster.local:9094'
            - '--cluster.peer=alertmanager-1.alertmanager.monitoring.svc.cluster.local:9094'
            - '--cluster.peer=alertmanager-2.alertmanager.monitoring.svc.cluster.local:9094'
            - '--cluster.reconnect-timeout=5m'
            - '--web.external-url=http://monitoring.bakery-ia.local/alertmanager'
            - '--web.route-prefix=/'
          ports:
            - name: web
              containerPort: 9093
            - name: mesh-tcp
              containerPort: 9094
            - name: mesh-udp
              containerPort: 9094
              protocol: UDP
          env:
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
          volumeMounts:
            - name: config-final
              mountPath: /etc/alertmanager
            - name: templates
              mountPath: /etc/alertmanager/templates
            - name: storage
              mountPath: /alertmanager
          resources:
            requests:
              memory: "128Mi"
              cpu: "100m"
            limits:
              memory: "256Mi"
              cpu: "500m"
          livenessProbe:
            httpGet:
              path: /-/healthy
              port: 9093
            initialDelaySeconds: 30
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /-/ready
              port: 9093
            initialDelaySeconds: 5
            periodSeconds: 5

        # Config reloader sidecar
        - name: configmap-reload
          image: jimmidyson/configmap-reload:v0.12.0
          args:
            - '--webhook-url=http://localhost:9093/-/reload'
            - '--volume-dir=/etc/alertmanager'
            # Also watch the templates ConfigMap. The rendered config above
            # lives in an emptyDir written once by the init container, so on
            # its own it never changes and the reloader would never trigger.
            - '--volume-dir=/etc/alertmanager-templates'
          volumeMounts:
            - name: config-final
              mountPath: /etc/alertmanager
              readOnly: true
            # Mounted outside /etc/alertmanager to avoid nesting a mount
            # inside the read-only emptyDir mount.
            - name: templates
              mountPath: /etc/alertmanager-templates
              readOnly: true
          resources:
            requests:
              memory: "16Mi"
              cpu: "10m"
            limits:
              memory: "32Mi"
              cpu: "50m"

      volumes:
        - name: init-script
          configMap:
            name: alertmanager-init-script
            defaultMode: 0755
        - name: config-template
          configMap:
            name: alertmanager-config
        # Holds the rendered config; recreated (and re-rendered) per pod start.
        - name: config-final
          emptyDir: {}
        - name: templates
          configMap:
            name: alertmanager-templates

  volumeClaimTemplates:
    - metadata:
        name: storage
      spec:
        accessModes: [ "ReadWriteOnce" ]
        resources:
          requests:
            storage: 2Gi
|
||||
|
||||
---
# Headless Service (clusterIP: None) named by the StatefulSet's
# `serviceName`. It exposes the web port and the 9094 mesh port over both
# TCP and UDP for pods labelled app=alertmanager.
kind: Service
apiVersion: v1
metadata:
  namespace: monitoring
  name: alertmanager
  labels:
    app: alertmanager
spec:
  clusterIP: None
  type: ClusterIP
  selector:
    app: alertmanager
  ports:
    - name: web
      port: 9093
      targetPort: 9093
    - name: mesh-tcp
      port: 9094
      targetPort: 9094
    - name: mesh-udp
      protocol: UDP
      port: 9094
      targetPort: 9094
|
||||
|
||||
---
# Regular ClusterIP Service exposing only the AlertManager web port (9093)
# across all pods labelled app=alertmanager.
kind: Service
apiVersion: v1
metadata:
  namespace: monitoring
  name: alertmanager-external
  labels:
    app: alertmanager
spec:
  type: ClusterIP
  selector:
    app: alertmanager
  ports:
    - name: web
      port: 9093
      targetPort: 9093
|
||||
@@ -1,949 +0,0 @@
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: grafana-dashboards-extended
|
||||
namespace: monitoring
|
||||
data:
|
||||
postgresql-dashboard.json: |
|
||||
{
|
||||
"dashboard": {
|
||||
"title": "Bakery IA - PostgreSQL Database",
|
||||
"tags": ["bakery-ia", "postgresql", "database"],
|
||||
"timezone": "browser",
|
||||
"refresh": "30s",
|
||||
"schemaVersion": 16,
|
||||
"version": 1,
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"title": "Active Connections by Database",
|
||||
"type": "graph",
|
||||
"gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "pg_stat_activity_count{state=\"active\"}",
|
||||
"legendFormat": "{{datname}} - active"
|
||||
},
|
||||
{
|
||||
"expr": "pg_stat_activity_count{state=\"idle\"}",
|
||||
"legendFormat": "{{datname}} - idle"
|
||||
},
|
||||
{
|
||||
"expr": "pg_stat_activity_count{state=\"idle in transaction\"}",
|
||||
"legendFormat": "{{datname}} - idle tx"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"title": "Total Connections",
|
||||
"type": "stat",
|
||||
"gridPos": {"x": 12, "y": 0, "w": 6, "h": 4},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(pg_stat_activity_count)",
|
||||
"legendFormat": "Total connections"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"title": "Max Connections",
|
||||
"type": "stat",
|
||||
"gridPos": {"x": 18, "y": 0, "w": 6, "h": 4},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "pg_settings_max_connections",
|
||||
"legendFormat": "Max connections"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"title": "Transaction Rate (Commits vs Rollbacks)",
|
||||
"type": "graph",
|
||||
"gridPos": {"x": 0, "y": 8, "w": 12, "h": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(pg_stat_database_xact_commit[5m])",
|
||||
"legendFormat": "{{datname}} - commits"
|
||||
},
|
||||
{
|
||||
"expr": "rate(pg_stat_database_xact_rollback[5m])",
|
||||
"legendFormat": "{{datname}} - rollbacks"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"title": "Cache Hit Ratio",
|
||||
"type": "graph",
|
||||
"gridPos": {"x": 12, "y": 8, "w": 12, "h": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (sum(rate(pg_stat_io_blocks_read_total[5m])) / (sum(rate(pg_stat_io_blocks_read_total[5m])) + sum(rate(pg_stat_io_blocks_hit_total[5m])))))",
|
||||
"legendFormat": "Cache hit ratio %"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"title": "Slow Queries (> 30s)",
|
||||
"type": "table",
|
||||
"gridPos": {"x": 0, "y": 16, "w": 12, "h": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "pg_slow_queries{duration_ms=~\"[3-9][0-9]{4}|[1-9][0-9]{5,}\"}",
|
||||
"format": "table",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {},
|
||||
"indexByName": {},
|
||||
"renameByName": {
|
||||
"query": "Query",
|
||||
"duration_ms": "Duration (ms)",
|
||||
"datname": "Database"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"title": "Dead Tuples by Table",
|
||||
"type": "graph",
|
||||
"gridPos": {"x": 12, "y": 16, "w": 12, "h": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "pg_stat_user_tables_n_dead_tup",
|
||||
"legendFormat": "{{schemaname}}.{{relname}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"title": "Table Bloat Estimate",
|
||||
"type": "graph",
|
||||
"gridPos": {"x": 0, "y": 24, "w": 12, "h": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (pg_stat_user_tables_n_dead_tup * avg_tuple_size) / (pg_total_relation_size * 8192)",
|
||||
"legendFormat": "{{schemaname}}.{{relname}} bloat %"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"title": "Replication Lag (bytes)",
|
||||
"type": "graph",
|
||||
"gridPos": {"x": 12, "y": 24, "w": 12, "h": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "pg_replication_lag_bytes",
|
||||
"legendFormat": "{{slot_name}} - {{application_name}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"title": "Database Size (GB)",
|
||||
"type": "graph",
|
||||
"gridPos": {"x": 0, "y": 32, "w": 12, "h": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "pg_database_size_bytes / 1024 / 1024 / 1024",
|
||||
"legendFormat": "{{datname}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"title": "Database Size Growth (per hour)",
|
||||
"type": "graph",
|
||||
"gridPos": {"x": 12, "y": 32, "w": 12, "h": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "delta(pg_database_size_bytes[1h])",
|
||||
"legendFormat": "{{datname}} - bytes/hour"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 12,
|
||||
"title": "Lock Counts by Type",
|
||||
"type": "graph",
|
||||
"gridPos": {"x": 0, "y": 40, "w": 12, "h": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "pg_locks_count",
|
||||
"legendFormat": "{{datname}} - {{locktype}} - {{mode}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"title": "Query Duration (p95)",
|
||||
"type": "graph",
|
||||
"gridPos": {"x": 12, "y": 40, "w": 12, "h": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum by (le) (rate(pg_query_duration_seconds_bucket[5m])))",
|
||||
"legendFormat": "p95"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
node-exporter-dashboard.json: |
|
||||
{
|
||||
"dashboard": {
|
||||
"title": "Bakery IA - Node Exporter Infrastructure",
|
||||
"tags": ["bakery-ia", "node-exporter", "infrastructure"],
|
||||
"timezone": "browser",
|
||||
"refresh": "15s",
|
||||
"schemaVersion": 16,
|
||||
"version": 1,
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"title": "CPU Usage by Node",
|
||||
"type": "graph",
|
||||
"gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - (avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"title": "Average CPU Usage",
|
||||
"type": "stat",
|
||||
"gridPos": {"x": 12, "y": 0, "w": 6, "h": 4},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
|
||||
"legendFormat": "Average CPU %"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"title": "CPU Load (1m, 5m, 15m)",
|
||||
"type": "stat",
|
||||
"gridPos": {"x": 18, "y": 0, "w": 6, "h": 4},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "avg(node_load1)",
|
||||
"legendFormat": "1m"
|
||||
},
|
||||
{
|
||||
"expr": "avg(node_load5)",
|
||||
"legendFormat": "5m"
|
||||
},
|
||||
{
|
||||
"expr": "avg(node_load15)",
|
||||
"legendFormat": "15m"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"title": "Memory Usage by Node",
|
||||
"type": "graph",
|
||||
"gridPos": {"x": 0, "y": 8, "w": 12, "h": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"title": "Memory Used (GB)",
|
||||
"type": "stat",
|
||||
"gridPos": {"x": 12, "y": 8, "w": 6, "h": 4},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / 1024 / 1024 / 1024",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"title": "Memory Available (GB)",
|
||||
"type": "stat",
|
||||
"gridPos": {"x": 18, "y": 8, "w": 6, "h": 4},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "node_memory_MemAvailable_bytes / 1024 / 1024 / 1024",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"title": "Disk I/O Read Rate (MB/s)",
|
||||
"type": "graph",
|
||||
"gridPos": {"x": 0, "y": 16, "w": 12, "h": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(node_disk_read_bytes_total[5m]) / 1024 / 1024",
|
||||
"legendFormat": "{{instance}} - {{device}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"title": "Disk I/O Write Rate (MB/s)",
|
||||
"type": "graph",
|
||||
"gridPos": {"x": 12, "y": 16, "w": 12, "h": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(node_disk_written_bytes_total[5m]) / 1024 / 1024",
|
||||
"legendFormat": "{{instance}} - {{device}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"title": "Disk I/O Operations (IOPS)",
|
||||
"type": "graph",
|
||||
"gridPos": {"x": 0, "y": 24, "w": 12, "h": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(node_disk_reads_completed_total[5m]) + rate(node_disk_writes_completed_total[5m])",
|
||||
"legendFormat": "{{instance}} - {{device}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"title": "Network Receive Rate (Mbps)",
|
||||
"type": "graph",
|
||||
"gridPos": {"x": 12, "y": 24, "w": 12, "h": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(node_network_receive_bytes_total{device!=\"lo\"}[5m]) * 8 / 1024 / 1024",
|
||||
"legendFormat": "{{instance}} - {{device}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"title": "Network Transmit Rate (Mbps)",
|
||||
"type": "graph",
|
||||
"gridPos": {"x": 0, "y": 32, "w": 12, "h": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(node_network_transmit_bytes_total{device!=\"lo\"}[5m]) * 8 / 1024 / 1024",
|
||||
"legendFormat": "{{instance}} - {{device}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 12,
|
||||
"title": "Network Errors",
|
||||
"type": "graph",
|
||||
"gridPos": {"x": 12, "y": 32, "w": 12, "h": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(node_network_receive_errs_total[5m]) + rate(node_network_transmit_errs_total[5m])",
|
||||
"legendFormat": "{{instance}} - {{device}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"title": "Filesystem Usage by Mount",
|
||||
"type": "graph",
|
||||
"gridPos": {"x": 0, "y": 40, "w": 12, "h": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes))",
|
||||
"legendFormat": "{{instance}} - {{mountpoint}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 14,
|
||||
"title": "Filesystem Available (GB)",
|
||||
"type": "stat",
|
||||
"gridPos": {"x": 12, "y": 40, "w": 6, "h": 4},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "node_filesystem_avail_bytes / 1024 / 1024 / 1024",
|
||||
"legendFormat": "{{instance}} - {{mountpoint}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 15,
|
||||
"title": "Filesystem Size (GB)",
|
||||
"type": "stat",
|
||||
"gridPos": {"x": 18, "y": 40, "w": 6, "h": 4},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "node_filesystem_size_bytes / 1024 / 1024 / 1024",
|
||||
"legendFormat": "{{instance}} - {{mountpoint}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 16,
|
||||
"title": "Load Average (1m, 5m, 15m)",
|
||||
"type": "graph",
|
||||
"gridPos": {"x": 0, "y": 48, "w": 12, "h": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "node_load1",
|
||||
"legendFormat": "{{instance}} - 1m"
|
||||
},
|
||||
{
|
||||
"expr": "node_load5",
|
||||
"legendFormat": "{{instance}} - 5m"
|
||||
},
|
||||
{
|
||||
"expr": "node_load15",
|
||||
"legendFormat": "{{instance}} - 15m"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 17,
|
||||
"title": "System Up Time",
|
||||
"type": "stat",
|
||||
"gridPos": {"x": 12, "y": 48, "w": 12, "h": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "time() - node_boot_time_seconds",
|
||||
"legendFormat": "{{instance}} - uptime"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 18,
|
||||
"title": "Context Switches",
|
||||
"type": "graph",
|
||||
"gridPos": {"x": 0, "y": 56, "w": 12, "h": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(node_context_switches_total[5m])",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 19,
|
||||
"title": "Interrupts",
|
||||
"type": "graph",
|
||||
"gridPos": {"x": 12, "y": 56, "w": 12, "h": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(node_intr_total[5m])",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
alertmanager-dashboard.json: |
|
||||
{
|
||||
"dashboard": {
|
||||
"title": "Bakery IA - AlertManager Monitoring",
|
||||
"tags": ["bakery-ia", "alertmanager", "alerting"],
|
||||
"timezone": "browser",
|
||||
"refresh": "10s",
|
||||
"schemaVersion": 16,
|
||||
"version": 1,
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"title": "Active Alerts by Severity",
|
||||
"type": "graph",
|
||||
"gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count by (severity) (ALERTS{alertstate=\"firing\"})",
|
||||
"legendFormat": "{{severity}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"title": "Total Active Alerts",
|
||||
"type": "stat",
|
||||
"gridPos": {"x": 12, "y": 0, "w": 6, "h": 4},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(ALERTS{alertstate=\"firing\"})",
|
||||
"legendFormat": "Active alerts"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"title": "Critical Alerts",
|
||||
"type": "stat",
|
||||
"gridPos": {"x": 18, "y": 0, "w": 6, "h": 4},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(ALERTS{alertstate=\"firing\", severity=\"critical\"})",
|
||||
"legendFormat": "Critical"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"title": "Alert Firing Rate (per minute)",
|
||||
"type": "graph",
|
||||
"gridPos": {"x": 0, "y": 8, "w": 12, "h": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(alertmanager_alerts_received_total{status=\"firing\"}[1m])) * 60",
|
||||
"legendFormat": "Alerts fired/min"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"title": "Alert Resolution Rate (per minute)",
|
||||
"type": "graph",
|
||||
"gridPos": {"x": 12, "y": 8, "w": 12, "h": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(alertmanager_alerts_received_total{status=\"resolved\"}[1m])) * 60",
|
||||
"legendFormat": "Alerts resolved/min"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"title": "Notification Success Rate",
|
||||
"type": "graph",
|
||||
"gridPos": {"x": 0, "y": 16, "w": 12, "h": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (sum(rate(alertmanager_notifications_failed_total[5m])) / sum(rate(alertmanager_notifications_total[5m]))))",
|
||||
"legendFormat": "Success rate %"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"title": "Notification Failures",
|
||||
"type": "graph",
|
||||
"gridPos": {"x": 12, "y": 16, "w": 12, "h": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (integration) (rate(alertmanager_notifications_failed_total[5m]))",
|
||||
"legendFormat": "{{integration}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"title": "Silenced Alerts",
|
||||
"type": "stat",
|
||||
"gridPos": {"x": 0, "y": 24, "w": 6, "h": 4},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "max(alertmanager_alerts{state=\"suppressed\"})",
|
||||
"legendFormat": "Silenced"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"title": "AlertManager Cluster Size",
|
||||
"type": "stat",
|
||||
"gridPos": {"x": 6, "y": 24, "w": 6, "h": 4},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "max(alertmanager_cluster_members)",
|
||||
"legendFormat": "Cluster peers"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"title": "AlertManager Peers",
|
||||
"type": "stat",
|
||||
"gridPos": {"x": 12, "y": 24, "w": 6, "h": 4},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "alertmanager_cluster_members",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"title": "Cluster Status",
|
||||
"type": "stat",
|
||||
"gridPos": {"x": 18, "y": 24, "w": 6, "h": 4},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "up{job=\"alertmanager\"}",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 12,
|
||||
"title": "Alerts by Group",
|
||||
"type": "table",
|
||||
"gridPos": {"x": 0, "y": 28, "w": 12, "h": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count by (alertname) (ALERTS{alertstate=\"firing\"})",
|
||||
"format": "table",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {},
|
||||
"indexByName": {},
|
||||
"renameByName": {
|
||||
"alertname": "Alert Name",
|
||||
"Value": "Count"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"title": "Alert Duration (p99)",
|
||||
"type": "graph",
|
||||
"gridPos": {"x": 12, "y": 28, "w": 12, "h": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, rate(alertmanager_alert_duration_seconds_bucket[5m]))",
|
||||
"legendFormat": "p99 duration"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 14,
|
||||
"title": "Processing Time",
|
||||
"type": "graph",
|
||||
"gridPos": {"x": 0, "y": 36, "w": 12, "h": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(alertmanager_receiver_processing_duration_seconds_sum[5m]) / rate(alertmanager_receiver_processing_duration_seconds_count[5m])",
|
||||
"legendFormat": "{{receiver}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 15,
|
||||
"title": "Memory Usage",
|
||||
"type": "stat",
|
||||
"gridPos": {"x": 12, "y": 36, "w": 12, "h": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "process_resident_memory_bytes{job=\"alertmanager\"} / 1024 / 1024",
|
||||
"legendFormat": "{{instance}} - MB"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
business-metrics-dashboard.json: |
|
||||
{
|
||||
"dashboard": {
|
||||
"title": "Bakery IA - Business Metrics & KPIs",
|
||||
"tags": ["bakery-ia", "business-metrics", "kpis"],
|
||||
"timezone": "browser",
|
||||
"refresh": "30s",
|
||||
"schemaVersion": 16,
|
||||
"version": 1,
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"title": "Requests per Service (Rate)",
|
||||
"type": "graph",
|
||||
"gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (service) (rate(http_requests_total[5m]))",
|
||||
"legendFormat": "{{service}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"title": "Total Request Rate",
|
||||
"type": "stat",
|
||||
"gridPos": {"x": 12, "y": 0, "w": 6, "h": 4},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(http_requests_total[5m]))",
|
||||
"legendFormat": "requests/sec"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"title": "Peak Request Rate (5m)",
|
||||
"type": "stat",
|
||||
"gridPos": {"x": 18, "y": 0, "w": 6, "h": 4},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "max_over_time(sum(rate(http_requests_total[1m]))[5m:])",
|
||||
"legendFormat": "Peak requests/sec"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"title": "Error Rates by Service",
|
||||
"type": "graph",
|
||||
"gridPos": {"x": 0, "y": 8, "w": 12, "h": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (service) (rate(http_requests_total{status_code=~\"5..\"}[5m]))",
|
||||
"legendFormat": "{{service}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"title": "Overall Error Rate",
|
||||
"type": "stat",
|
||||
"gridPos": {"x": 12, "y": 8, "w": 6, "h": 4},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (sum(rate(http_requests_total{status_code=~\"5..\"}[5m])) / sum(rate(http_requests_total[5m])))",
|
||||
"legendFormat": "Error %"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"title": "4xx Error Rate",
|
||||
"type": "stat",
|
||||
"gridPos": {"x": 18, "y": 8, "w": 6, "h": 4},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (sum(rate(http_requests_total{status_code=~\"4..\"}[5m])) / sum(rate(http_requests_total[5m])))",
|
||||
"legendFormat": "4xx %"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"title": "P95 Latency by Service (ms)",
|
||||
"type": "graph",
|
||||
"gridPos": {"x": 0, "y": 16, "w": 12, "h": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum by (service, le) (rate(http_request_duration_seconds_bucket[5m]))) * 1000",
|
||||
"legendFormat": "{{service}} p95"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"title": "P99 Latency by Service (ms)",
|
||||
"type": "graph",
|
||||
"gridPos": {"x": 12, "y": 16, "w": 12, "h": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, sum by (service, le) (rate(http_request_duration_seconds_bucket[5m]))) * 1000",
|
||||
"legendFormat": "{{service}} p99"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"title": "Average Latency (ms)",
|
||||
"type": "stat",
|
||||
"gridPos": {"x": 0, "y": 24, "w": 6, "h": 4},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(sum(rate(http_request_duration_seconds_sum[5m])) / sum(rate(http_request_duration_seconds_count[5m]))) * 1000",
|
||||
"legendFormat": "Avg latency ms"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"title": "Active Tenants",
|
||||
"type": "stat",
|
||||
"gridPos": {"x": 6, "y": 24, "w": 6, "h": 4},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(count by (tenant_id) (rate(http_requests_total[5m])))",
|
||||
"legendFormat": "Active tenants"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"title": "Requests per Tenant",
|
||||
"type": "stat",
|
||||
"gridPos": {"x": 12, "y": 24, "w": 12, "h": 4},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (tenant_id) (rate(http_requests_total[5m]))",
|
||||
"legendFormat": "Tenant {{tenant_id}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 12,
|
||||
"title": "Alert Generation Rate (per minute)",
|
||||
"type": "graph",
|
||||
"gridPos": {"x": 0, "y": 32, "w": 12, "h": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(ALERTS_FOR_STATE[1m])",
|
||||
"legendFormat": "{{alertname}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"title": "Training Job Success Rate",
|
||||
"type": "stat",
|
||||
"gridPos": {"x": 12, "y": 32, "w": 12, "h": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (sum(training_job_completed_total{status=\"success\"}) / sum(training_job_completed_total))",
|
||||
"legendFormat": "Success rate %"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 14,
|
||||
"title": "Training Jobs in Progress",
|
||||
"type": "stat",
|
||||
"gridPos": {"x": 0, "y": 40, "w": 6, "h": 4},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(training_job_in_progress)",
|
||||
"legendFormat": "Jobs running"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 15,
|
||||
"title": "Training Job Completion Time (p95, minutes)",
|
||||
"type": "stat",
|
||||
"gridPos": {"x": 6, "y": 40, "w": 6, "h": 4},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum by (le) (training_job_duration_seconds_bucket)) / 60",
|
||||
"legendFormat": "p95 minutes"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 16,
|
||||
"title": "Failed Training Jobs",
|
||||
"type": "stat",
|
||||
"gridPos": {"x": 12, "y": 40, "w": 6, "h": 4},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(training_job_completed_total{status=\"failed\"})",
|
||||
"legendFormat": "Failed jobs"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 17,
|
||||
"title": "Total Training Jobs Completed",
|
||||
"type": "stat",
|
||||
"gridPos": {"x": 18, "y": 40, "w": 6, "h": 4},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(training_job_completed_total)",
|
||||
"legendFormat": "Total completed"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 18,
|
||||
"title": "API Health Status",
|
||||
"type": "table",
|
||||
"gridPos": {"x": 0, "y": 48, "w": 12, "h": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "up{job=\"bakery-services\"}",
|
||||
"format": "table",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {},
|
||||
"indexByName": {},
|
||||
"renameByName": {
|
||||
"service": "Service",
|
||||
"Value": "Status",
|
||||
"instance": "Instance"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 19,
|
||||
"title": "Service Success Rate (%)",
|
||||
"type": "graph",
|
||||
"gridPos": {"x": 12, "y": 48, "w": 12, "h": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (sum by (service) (rate(http_requests_total{status_code=~\"5..\"}[5m])) / sum by (service) (rate(http_requests_total[5m]))))",
|
||||
"legendFormat": "{{service}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 20,
|
||||
"title": "Requests Processed Today",
|
||||
"type": "stat",
|
||||
"gridPos": {"x": 0, "y": 56, "w": 12, "h": 4},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(increase(http_requests_total[24h]))",
|
||||
"legendFormat": "Requests (24h)"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 21,
|
||||
"title": "Distinct Users Today",
|
||||
"type": "stat",
|
||||
"gridPos": {"x": 12, "y": 56, "w": 12, "h": 4},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(count by (user_id) (increase(http_requests_total{user_id!=\"\"}[24h])))",
|
||||
"legendFormat": "Users (24h)"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
@@ -1,177 +0,0 @@
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: grafana-dashboards
|
||||
namespace: monitoring
|
||||
data:
|
||||
gateway-metrics.json: |
|
||||
{
|
||||
"dashboard": {
|
||||
"title": "Bakery IA - Gateway Metrics",
|
||||
"tags": ["bakery-ia", "gateway"],
|
||||
"timezone": "browser",
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"title": "Request Rate by Endpoint",
|
||||
"type": "graph",
|
||||
"gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
|
||||
"targets": [{
|
||||
"expr": "rate(http_requests_total{service=\"gateway\"}[5m])",
|
||||
"legendFormat": "{{method}} {{endpoint}}"
|
||||
}]
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"title": "P95 Request Latency",
|
||||
"type": "graph",
|
||||
"gridPos": {"x": 12, "y": 0, "w": 12, "h": 8},
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{service=\"gateway\"}[5m]))",
|
||||
"legendFormat": "{{endpoint}} p95"
|
||||
}]
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"title": "Error Rate (5xx)",
|
||||
"type": "graph",
|
||||
"gridPos": {"x": 0, "y": 8, "w": 12, "h": 8},
|
||||
"targets": [{
|
||||
"expr": "rate(http_requests_total{service=\"gateway\",status_code=~\"5..\"}[5m])",
|
||||
"legendFormat": "{{endpoint}} errors"
|
||||
}]
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"title": "Active Requests",
|
||||
"type": "stat",
|
||||
"gridPos": {"x": 12, "y": 8, "w": 6, "h": 4},
|
||||
"targets": [{
|
||||
"expr": "sum(rate(http_requests_total{service=\"gateway\"}[1m]))"
|
||||
}]
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"title": "Authentication Success Rate",
|
||||
"type": "stat",
|
||||
"gridPos": {"x": 18, "y": 8, "w": 6, "h": 4},
|
||||
"targets": [{
|
||||
"expr": "rate(gateway_auth_responses_total[5m]) / rate(gateway_auth_requests_total[5m]) * 100"
|
||||
}]
|
||||
}
|
||||
],
|
||||
"refresh": "10s",
|
||||
"schemaVersion": 16,
|
||||
"version": 1
|
||||
}
|
||||
}
|
||||
|
||||
services-overview.json: |
|
||||
{
|
||||
"dashboard": {
|
||||
"title": "Bakery IA - Services Overview",
|
||||
"tags": ["bakery-ia", "services"],
|
||||
"timezone": "browser",
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"title": "Request Rate by Service",
|
||||
"type": "graph",
|
||||
"gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
|
||||
"targets": [{
|
||||
"expr": "sum by (service) (rate(http_requests_total[5m]))",
|
||||
"legendFormat": "{{service}}"
|
||||
}]
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"title": "P99 Latency by Service",
|
||||
"type": "graph",
|
||||
"gridPos": {"x": 12, "y": 0, "w": 12, "h": 8},
|
||||
"targets": [{
|
||||
"expr": "histogram_quantile(0.99, sum by (service, le) (rate(http_request_duration_seconds_bucket[5m])))",
|
||||
"legendFormat": "{{service}} p99"
|
||||
}]
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"title": "Error Rate by Service",
|
||||
"type": "graph",
|
||||
"gridPos": {"x": 0, "y": 8, "w": 24, "h": 8},
|
||||
"targets": [{
|
||||
"expr": "sum by (service) (rate(http_requests_total{status_code=~\"5..\"}[5m]))",
|
||||
"legendFormat": "{{service}}"
|
||||
}]
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"title": "Service Health Status",
|
||||
"type": "table",
|
||||
"gridPos": {"x": 0, "y": 16, "w": 24, "h": 8},
|
||||
"targets": [{
|
||||
"expr": "up{job=\"bakery-services\"}",
|
||||
"format": "table",
|
||||
"instant": true
|
||||
}],
|
||||
"transformations": [{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {},
|
||||
"indexByName": {},
|
||||
"renameByName": {
|
||||
"service": "Service Name",
|
||||
"Value": "Status"
|
||||
}
|
||||
}
|
||||
}]
|
||||
}
|
||||
],
|
||||
"refresh": "30s",
|
||||
"schemaVersion": 16,
|
||||
"version": 1
|
||||
}
|
||||
}
|
||||
|
||||
circuit-breakers.json: |
|
||||
{
|
||||
"dashboard": {
|
||||
"title": "Bakery IA - Circuit Breakers",
|
||||
"tags": ["bakery-ia", "reliability"],
|
||||
"timezone": "browser",
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"title": "Circuit Breaker States",
|
||||
"type": "stat",
|
||||
"gridPos": {"x": 0, "y": 0, "w": 24, "h": 4},
|
||||
"targets": [{
|
||||
"expr": "circuit_breaker_state",
|
||||
"legendFormat": "{{service}} - {{state}}"
|
||||
}]
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"title": "Circuit Breaker Trips",
|
||||
"type": "graph",
|
||||
"gridPos": {"x": 0, "y": 4, "w": 12, "h": 8},
|
||||
"targets": [{
|
||||
"expr": "rate(circuit_breaker_opened_total[5m])",
|
||||
"legendFormat": "{{service}}"
|
||||
}]
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"title": "Rejected Requests",
|
||||
"type": "graph",
|
||||
"gridPos": {"x": 12, "y": 4, "w": 12, "h": 8},
|
||||
"targets": [{
|
||||
"expr": "rate(circuit_breaker_rejected_total[5m])",
|
||||
"legendFormat": "{{service}}"
|
||||
}]
|
||||
}
|
||||
],
|
||||
"refresh": "10s",
|
||||
"schemaVersion": 16,
|
||||
"version": 1
|
||||
}
|
||||
}
|
||||
@@ -1,166 +0,0 @@
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: grafana-datasources
|
||||
namespace: monitoring
|
||||
data:
|
||||
prometheus.yaml: |
|
||||
apiVersion: 1
|
||||
datasources:
|
||||
- name: Prometheus
|
||||
type: prometheus
|
||||
access: proxy
|
||||
url: http://prometheus:9090
|
||||
isDefault: true
|
||||
editable: false
|
||||
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: grafana-dashboards-config
|
||||
namespace: monitoring
|
||||
data:
|
||||
dashboards.yaml: |
|
||||
apiVersion: 1
|
||||
providers:
|
||||
- name: 'default'
|
||||
orgId: 1
|
||||
folder: 'Bakery IA'
|
||||
type: file
|
||||
disableDeletion: false
|
||||
updateIntervalSeconds: 10
|
||||
allowUiUpdates: true
|
||||
options:
|
||||
path: /var/lib/grafana/dashboards
|
||||
- name: 'extended'
|
||||
orgId: 1
|
||||
folder: 'Bakery IA - Extended'
|
||||
type: file
|
||||
disableDeletion: false
|
||||
updateIntervalSeconds: 10
|
||||
allowUiUpdates: true
|
||||
options:
|
||||
path: /var/lib/grafana/dashboards-extended
|
||||
|
||||
---
# Grafana UI for the monitoring stack: one replica, state kept on the
# grafana-storage claim, datasources and dashboards provisioned from ConfigMaps.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: grafana
  namespace: monitoring
  labels:
    app: grafana
spec:
  replicas: 1
  # The pod mounts the ReadWriteOnce claim grafana-storage. With the default
  # RollingUpdate strategy the replacement pod is started while the old pod
  # still holds the volume, which can leave the rollout stuck. Recreate stops
  # the old pod before the new one is created.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: grafana
  template:
    metadata:
      labels:
        app: grafana
    spec:
      containers:
        - name: grafana
          image: grafana/grafana:12.3.0
          ports:
            - containerPort: 3000
              name: http
          env:
            # Admin credentials come from the grafana-admin Secret.
            - name: GF_SECURITY_ADMIN_USER
              valueFrom:
                secretKeyRef:
                  name: grafana-admin
                  key: admin-user
            - name: GF_SECURITY_ADMIN_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: grafana-admin
                  key: admin-password
            # Grafana is published under the /grafana sub-path.
            - name: GF_SERVER_ROOT_URL
              value: "http://monitoring.bakery-ia.local/grafana"
            - name: GF_SERVER_SERVE_FROM_SUB_PATH
              value: "true"
            - name: GF_AUTH_ANONYMOUS_ENABLED
              value: "false"
            - name: GF_INSTALL_PLUGINS
              value: ""
          volumeMounts:
            - name: grafana-storage
              mountPath: /var/lib/grafana
            - name: grafana-datasources
              mountPath: /etc/grafana/provisioning/datasources
            - name: grafana-dashboards-config
              mountPath: /etc/grafana/provisioning/dashboards
            - name: grafana-dashboards
              mountPath: /var/lib/grafana/dashboards
            - name: grafana-dashboards-extended
              mountPath: /var/lib/grafana/dashboards-extended
          resources:
            requests:
              memory: "256Mi"
              cpu: "100m"
            limits:
              memory: "512Mi"
              cpu: "500m"
          livenessProbe:
            httpGet:
              path: /api/health
              port: 3000
            initialDelaySeconds: 30
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /api/health
              port: 3000
            initialDelaySeconds: 5
            periodSeconds: 5
      volumes:
        - name: grafana-storage
          persistentVolumeClaim:
            claimName: grafana-storage
        - name: grafana-datasources
          configMap:
            name: grafana-datasources
        - name: grafana-dashboards-config
          configMap:
            name: grafana-dashboards-config
        - name: grafana-dashboards
          configMap:
            name: grafana-dashboards
        - name: grafana-dashboards-extended
          configMap:
            name: grafana-dashboards-extended
|
||||
|
||||
---
# 5Gi ReadWriteOnce claim referenced by the grafana Deployment.
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
  namespace: monitoring
  name: grafana-storage
spec:
  accessModes: [ReadWriteOnce]
  resources:
    requests:
      storage: 5Gi
|
||||
|
||||
---
# Cluster-internal Service exposing Grafana's HTTP port 3000.
kind: Service
apiVersion: v1
metadata:
  namespace: monitoring
  name: grafana
  labels:
    app: grafana
spec:
  type: ClusterIP
  selector:
    app: grafana
  ports:
    - name: http
      protocol: TCP
      port: 3000
      targetPort: 3000
|
||||
@@ -1,100 +0,0 @@
|
||||
---
# PodDisruptionBudgets ensure minimum availability during voluntary disruptions
# (node drains, rolling updates, etc.)

# Keep at least one pod labelled app=prometheus running.
kind: PodDisruptionBudget
apiVersion: policy/v1
metadata:
  namespace: monitoring
  name: prometheus-pdb
spec:
  selector:
    matchLabels:
      app: prometheus
  minAvailable: 1
|
||||
|
||||
---
# Keep at least two pods labelled app=alertmanager running during voluntary
# disruptions.
kind: PodDisruptionBudget
apiVersion: policy/v1
metadata:
  namespace: monitoring
  name: alertmanager-pdb
spec:
  selector:
    matchLabels:
      app: alertmanager
  minAvailable: 2
|
||||
|
||||
---
# Disruption budget for Grafana.
# The grafana Deployment runs a single replica, so `minAvailable: 1` would
# forbid every voluntary eviction and make `kubectl drain` wait forever on the
# node hosting the pod. `maxUnavailable: 1` lets that one pod be evicted, so
# Grafana is briefly unavailable during a drain instead of blocking it.
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  name: grafana-pdb
  namespace: monitoring
spec:
  maxUnavailable: 1
  selector:
    matchLabels:
      app: grafana
|
||||
|
||||
---
# ResourceQuota: hard caps on aggregate compute, storage and object counts in
# the monitoring namespace.
kind: ResourceQuota
apiVersion: v1
metadata:
  namespace: monitoring
  name: monitoring-quota
spec:
  hard:
    # Compute: sum of container requests / limits
    requests.cpu: "10"
    limits.cpu: "20"
    requests.memory: "16Gi"
    limits.memory: "32Gi"

    # Storage: number of claims and total requested capacity
    persistentvolumeclaims: "10"
    requests.storage: "100Gi"

    # Object counts
    pods: "50"
    services: "20"
    configmaps: "30"
    secrets: "20"
|
||||
|
||||
---
# LimitRange: per-object bounds and defaults applied in the monitoring
# namespace.
kind: LimitRange
apiVersion: v1
metadata:
  namespace: monitoring
  name: monitoring-limits
spec:
  limits:
    # Containers: allowed range, plus the values used when a container omits
    # its own requests or limits.
    - type: Container
      min:
        cpu: "10m"
        memory: "16Mi"
      max:
        cpu: "2"
        memory: "4Gi"
      defaultRequest:
        cpu: "100m"
        memory: "128Mi"
      default:
        cpu: "500m"
        memory: "512Mi"

    # Pods: upper bound only
    - type: Pod
      max:
        cpu: "4"
        memory: "8Gi"

    # PersistentVolumeClaims: allowed size range
    - type: PersistentVolumeClaim
      min:
        storage: "1Gi"
      max:
        storage: "50Gi"
|
||||
@@ -1,42 +0,0 @@
|
||||
---
# Path-based routing on monitoring.bakery-ia.local. Each rule captures
# everything after the tool prefix as $2, and the rewrite-target annotation
# forwards only that remainder to the backend.
# NOTE(review): Grafana is configured with GF_SERVER_SERVE_FROM_SUB_PATH=true;
# confirm that stripping the /grafana prefix here does not conflict with it.
kind: Ingress
apiVersion: networking.k8s.io/v1
metadata:
  namespace: monitoring
  name: monitoring-ingress
  annotations:
    nginx.ingress.kubernetes.io/ssl-redirect: "false"
    nginx.ingress.kubernetes.io/rewrite-target: /$2
spec:
  rules:
    - host: monitoring.bakery-ia.local
      http:
        paths:
          # Grafana UI
          - pathType: ImplementationSpecific
            path: '/grafana(/|$)(.*)'
            backend:
              service:
                name: grafana
                port:
                  number: 3000
          # Prometheus UI / API
          - pathType: ImplementationSpecific
            path: '/prometheus(/|$)(.*)'
            backend:
              service:
                name: prometheus-external
                port:
                  number: 9090
          # Jaeger query UI
          - pathType: ImplementationSpecific
            path: '/jaeger(/|$)(.*)'
            backend:
              service:
                name: jaeger-query
                port:
                  number: 16686
          # Alertmanager UI / API
          - pathType: ImplementationSpecific
            path: '/alertmanager(/|$)(.*)'
            backend:
              service:
                name: alertmanager-external
                port:
                  number: 9093
|
||||
@@ -1,190 +0,0 @@
|
||||
---
# Jaeger all-in-one (collector, query UI and agent in one container) with
# Badger storage persisted on the jaeger-storage claim.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: jaeger
  namespace: monitoring
  labels:
    app: jaeger
spec:
  replicas: 1
  # The single pod owns the ReadWriteOnce claim jaeger-storage and keeps its
  # Badger data there (BADGER_EPHEMERAL=false). The default RollingUpdate
  # strategy would start a second pod against the same volume before the first
  # is stopped; Recreate terminates the old pod first.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: jaeger
  template:
    metadata:
      labels:
        app: jaeger
    spec:
      containers:
        - name: jaeger
          image: jaegertracing/all-in-one:1.51
          env:
            - name: COLLECTOR_ZIPKIN_HOST_PORT
              value: ":9411"
            - name: COLLECTOR_OTLP_ENABLED
              value: "true"
            # Persistent Badger storage under the /badger volume mount.
            - name: SPAN_STORAGE_TYPE
              value: "badger"
            - name: BADGER_EPHEMERAL
              value: "false"
            - name: BADGER_DIRECTORY_VALUE
              value: "/badger/data"
            - name: BADGER_DIRECTORY_KEY
              value: "/badger/key"
          ports:
            - containerPort: 5775
              protocol: UDP
              name: zipkin-compact
            - containerPort: 6831
              protocol: UDP
              name: jaeger-compact
            - containerPort: 6832
              protocol: UDP
              name: jaeger-binary
            - containerPort: 5778
              protocol: TCP
              name: config-rest
            - containerPort: 16686
              protocol: TCP
              name: query
            - containerPort: 14250
              protocol: TCP
              name: grpc
            - containerPort: 14268
              protocol: TCP
              name: c-tchan-trft
            - containerPort: 14269
              protocol: TCP
              name: admin-http
            - containerPort: 9411
              protocol: TCP
              name: zipkin
            - containerPort: 4317
              protocol: TCP
              name: otlp-grpc
            - containerPort: 4318
              protocol: TCP
              name: otlp-http
          volumeMounts:
            - name: jaeger-storage
              mountPath: /badger
          resources:
            requests:
              memory: "512Mi"
              cpu: "250m"
            limits:
              memory: "1Gi"
              cpu: "500m"
          # Both probes hit the admin-http port (14269).
          livenessProbe:
            httpGet:
              path: /
              port: 14269
            initialDelaySeconds: 30
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /
              port: 14269
            initialDelaySeconds: 5
            periodSeconds: 5
      volumes:
        - name: jaeger-storage
          persistentVolumeClaim:
            claimName: jaeger-storage
|
||||
|
||||
---
# 10Gi ReadWriteOnce claim referenced by the jaeger Deployment.
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
  namespace: monitoring
  name: jaeger-storage
spec:
  accessModes: [ReadWriteOnce]
  resources:
    requests:
      storage: 10Gi
|
||||
|
||||
---
# Cluster-internal Service for the Jaeger query port (16686).
kind: Service
apiVersion: v1
metadata:
  namespace: monitoring
  name: jaeger-query
  labels:
    app: jaeger
spec:
  type: ClusterIP
  selector:
    app: jaeger
  ports:
    - name: query
      protocol: TCP
      port: 16686
      targetPort: 16686
|
||||
|
||||
---
# Cluster-internal Service for the TCP ingestion ports of the jaeger pod.
kind: Service
apiVersion: v1
metadata:
  namespace: monitoring
  name: jaeger-collector
  labels:
    app: jaeger
spec:
  type: ClusterIP
  selector:
    app: jaeger
  ports:
    - name: c-tchan-trft
      protocol: TCP
      port: 14268
      targetPort: 14268
    - name: grpc
      protocol: TCP
      port: 14250
      targetPort: 14250
    - name: zipkin
      protocol: TCP
      port: 9411
      targetPort: 9411
    - name: otlp-grpc
      protocol: TCP
      port: 4317
      targetPort: 4317
    - name: otlp-http
      protocol: TCP
      port: 4318
      targetPort: 4318
|
||||
|
||||
---
# Headless Service (clusterIP: None) for the agent ports of the jaeger pod.
kind: Service
apiVersion: v1
metadata:
  namespace: monitoring
  name: jaeger-agent
  labels:
    app: jaeger
spec:
  type: ClusterIP
  clusterIP: None
  selector:
    app: jaeger
  ports:
    - name: zipkin-compact
      protocol: UDP
      port: 5775
      targetPort: 5775
    - name: jaeger-compact
      protocol: UDP
      port: 6831
      targetPort: 6831
    - name: jaeger-binary
      protocol: UDP
      port: 6832
      targetPort: 6832
    - name: config-rest
      protocol: TCP
      port: 5778
      targetPort: 5778
|
||||
@@ -1,18 +1,20 @@
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

# Minimal Monitoring Infrastructure
# SigNoz is now managed via Helm in the 'signoz' namespace
# This kustomization only maintains:
# - Namespace for legacy resources (if needed)
# - Node exporter for infrastructure metrics
# - PostgreSQL exporter for database metrics
# - Optional OTEL collector (can be disabled if using SigNoz's built-in collector)

# Each file is listed exactly once. The Prometheus, Alertmanager, Grafana,
# Jaeger, HA-policy and ingress manifests were removed from this directory and
# must not be referenced here.
resources:
  - namespace.yaml
  - secrets.yaml
  # Exporters for metrics collection
  - node-exporter.yaml
  - postgres-exporter.yaml
  # Optional: Keep OTEL collector or use SigNoz's built-in one
  # Uncomment if you want a dedicated OTEL collector in monitoring namespace
  # NOTE(review): workloads whose OTEL_EXPORTER_OTLP_ENDPOINT points at
  # otel-collector.monitoring.svc.cluster.local need this entry enabled.
  # - otel-collector.yaml
|
||||
|
||||
@@ -0,0 +1,167 @@
|
||||
---
# OpenTelemetry Collector pipeline configuration.
# Receives OTLP (gRPC 4317 / HTTP 4318), batches it under a memory limiter,
# and exports traces, metrics and logs to SigNoz. Metrics are additionally
# exposed on :8889 for Prometheus-style scraping.
apiVersion: v1
kind: ConfigMap
metadata:
  name: otel-collector-config
  namespace: monitoring
data:
  otel-collector-config.yaml: |
    extensions:
      health_check:
        endpoint: 0.0.0.0:13133

    receivers:
      otlp:
        protocols:
          grpc:
            endpoint: 0.0.0.0:4317
          http:
            endpoint: 0.0.0.0:4318

    processors:
      batch:
        timeout: 10s
        send_batch_size: 1024

      # Memory limiter to prevent OOM.
      # The limit must stay below the container memory limit (512Mi in the
      # otel-collector Deployment); otherwise the kernel kills the pod before
      # the limiter can start refusing data.
      memory_limiter:
        check_interval: 1s
        limit_mib: 400
        spike_limit_mib: 100

    exporters:
      # Export metrics to Prometheus
      prometheus:
        endpoint: "0.0.0.0:8889"
        namespace: otelcol
        const_labels:
          source: otel-collector

      # Export to SigNoz.
      # OTLP has to be sent to SigNoz's own collector on the OTLP gRPC port
      # (4317), not to the query service HTTP API.
      # NOTE(review): service name assumes the Helm release is called
      # "signoz" in namespace "signoz" - confirm against the installed chart.
      otlp/signoz:
        endpoint: "signoz-otel-collector.signoz.svc.cluster.local:4317"
        tls:
          insecure: true

      # Logging exporter for debugging traces and logs
      # (`verbosity` replaces the deprecated `loglevel` option).
      logging:
        verbosity: normal
        sampling_initial: 5
        sampling_thereafter: 200

    service:
      extensions: [health_check]
      pipelines:
        # Traces pipeline: receive -> process -> export to SigNoz
        traces:
          receivers: [otlp]
          processors: [memory_limiter, batch]
          exporters: [otlp/signoz, logging]

        # Metrics pipeline: receive -> process -> export to both Prometheus and SigNoz
        metrics:
          receivers: [otlp]
          processors: [memory_limiter, batch]
          exporters: [prometheus, otlp/signoz]

        # Logs pipeline: receive -> process -> export to SigNoz
        logs:
          receivers: [otlp]
          processors: [memory_limiter, batch]
          exporters: [otlp/signoz, logging]
|
||||
|
||||
---
# Single-replica OpenTelemetry Collector (contrib image). Its configuration is
# read from the otel-collector-config ConfigMap mounted at /conf.
kind: Deployment
apiVersion: apps/v1
metadata:
  namespace: monitoring
  name: otel-collector
  labels:
    app: otel-collector
spec:
  replicas: 1
  selector:
    matchLabels:
      app: otel-collector
  template:
    metadata:
      labels:
        app: otel-collector
    spec:
      volumes:
        # Project only the collector config key into the volume.
        - name: otel-collector-config
          configMap:
            name: otel-collector-config
            items:
              - key: otel-collector-config.yaml
                path: otel-collector-config.yaml
      containers:
        - name: otel-collector
          image: otel/opentelemetry-collector-contrib:0.91.0
          args: ["--config=/conf/otel-collector-config.yaml"]
          volumeMounts:
            - name: otel-collector-config
              mountPath: /conf
          ports:
            - name: otlp-grpc
              protocol: TCP
              containerPort: 4317
            - name: otlp-http
              protocol: TCP
              containerPort: 4318
            - name: prometheus
              protocol: TCP
              containerPort: 8889
            - name: health-check
              protocol: TCP
              containerPort: 13133
          resources:
            limits:
              cpu: "500m"
              memory: "512Mi"
            requests:
              cpu: "100m"
              memory: "256Mi"
          # Both probes query the health-check port (13133).
          readinessProbe:
            initialDelaySeconds: 5
            periodSeconds: 5
            httpGet:
              path: /
              port: 13133
          livenessProbe:
            initialDelaySeconds: 30
            periodSeconds: 10
            httpGet:
              path: /
              port: 13133
|
||||
|
||||
---
# Cluster-internal Service for the collector: OTLP gRPC/HTTP ingestion plus
# the metrics port advertised through the prometheus.io/* annotations.
kind: Service
apiVersion: v1
metadata:
  namespace: monitoring
  name: otel-collector
  labels:
    app: otel-collector
  annotations:
    prometheus.io/scrape: "true"
    prometheus.io/path: "/metrics"
    prometheus.io/port: "8889"
spec:
  type: ClusterIP
  selector:
    app: otel-collector
  ports:
    - name: otlp-grpc
      protocol: TCP
      port: 4317
      targetPort: 4317
    - name: otlp-http
      protocol: TCP
      port: 4318
      targetPort: 4318
    - name: prometheus
      protocol: TCP
      port: 8889
      targetPort: 8889
|
||||
@@ -1,278 +0,0 @@
|
||||
---
# ServiceAccount used by the prometheus StatefulSet.
kind: ServiceAccount
apiVersion: v1
metadata:
  namespace: monitoring
  name: prometheus
|
||||
|
||||
---
# Read-only cluster access for Prometheus service discovery and scraping.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
  # Core objects used for discovery; nodes/proxy allows scraping node
  # metrics through the API server proxy.
  - apiGroups: [""]
    resources:
      - nodes
      - nodes/proxy
      - services
      - endpoints
      - pods
    verbs: ["get", "list", "watch"]
  # Ingress objects are served from networking.k8s.io; the `extensions` group
  # no longer serves them on current clusters and is kept only so the role
  # still works on old ones.
  - apiGroups:
      - networking.k8s.io
      - extensions
    resources:
      - ingresses
    verbs: ["get", "list", "watch"]
  # Allow GET on the raw /metrics endpoint.
  - nonResourceURLs: ["/metrics"]
    verbs: ["get"]
|
||||
|
||||
---
# Grants the prometheus ClusterRole to the prometheus ServiceAccount in the
# monitoring namespace.
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: prometheus
subjects:
  - kind: ServiceAccount
    namespace: monitoring
    name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
|
||||
|
||||
---
# Prometheus server configuration: global settings, Alertmanager targets,
# rule file location and all scrape jobs.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: monitoring
data:
  prometheus.yml: |
    global:
      scrape_interval: 30s
      evaluation_interval: 30s
      external_labels:
        cluster: 'bakery-ia'
        environment: 'production'

    # AlertManager configuration
    alerting:
      alertmanagers:
        - static_configs:
            - targets:
                - alertmanager-0.alertmanager.monitoring.svc.cluster.local:9093
                - alertmanager-1.alertmanager.monitoring.svc.cluster.local:9093
                - alertmanager-2.alertmanager.monitoring.svc.cluster.local:9093

    # Load alert rules
    rule_files:
      - '/etc/prometheus/rules/*.yml'

    scrape_configs:
      # Scrape Prometheus itself
      - job_name: 'prometheus'
        static_configs:
          - targets: ['localhost:9090']

      # Scrape all bakery-ia services
      - job_name: 'bakery-services'
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names:
                - bakery-ia
        relabel_configs:
          # Only scrape pods with metrics port
          - source_labels: [__meta_kubernetes_pod_container_port_name]
            action: keep
            regex: http

          # Add service name label
          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
            target_label: service

          # Add component label
          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component]
            target_label: component

          # Add pod name
          - source_labels: [__meta_kubernetes_pod_name]
            target_label: pod

          # Set metrics path
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)

          # Set scrape port
          - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
            action: replace
            regex: ([^:]+)(?::\d+)?;(\d+)
            replacement: $1:$2
            target_label: __address__

      # Scrape Kubernetes nodes
      # Targets are rewritten to the API server (kubernetes.default.svc:443)
      # and fetched through its node proxy, so the scrape must use HTTPS with
      # the in-cluster CA and the pod's ServiceAccount token.
      - job_name: 'kubernetes-nodes'
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        authorization:
          credentials_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        kubernetes_sd_configs:
          - role: node
        relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
          - target_label: __address__
            replacement: kubernetes.default.svc:443
          - source_labels: [__meta_kubernetes_node_name]
            regex: (.+)
            target_label: __metrics_path__
            replacement: /api/v1/nodes/${1}/proxy/metrics

      # Scrape AlertManager
      - job_name: 'alertmanager'
        static_configs:
          - targets:
              - alertmanager-0.alertmanager.monitoring.svc.cluster.local:9093
              - alertmanager-1.alertmanager.monitoring.svc.cluster.local:9093
              - alertmanager-2.alertmanager.monitoring.svc.cluster.local:9093

      # Scrape PostgreSQL exporter
      - job_name: 'postgres-exporter'
        static_configs:
          - targets: ['postgres-exporter.monitoring.svc.cluster.local:9187']

      # Scrape Node Exporter
      # Node discovery returns the kubelet address (:10250); rewrite it to
      # the node-exporter port (:9100).
      - job_name: 'node-exporter'
        kubernetes_sd_configs:
          - role: node
        relabel_configs:
          - source_labels: [__address__]
            regex: '(.*):10250'
            replacement: '${1}:9100'
            target_label: __address__
          - source_labels: [__meta_kubernetes_node_name]
            target_label: node
|
||||
|
||||
---
# Two-replica Prometheus StatefulSet. Each replica gets its own 20Gi volume
# from the claim template, and replicas prefer to land on different nodes.
kind: StatefulSet
apiVersion: apps/v1
metadata:
  namespace: monitoring
  name: prometheus
  labels:
    app: prometheus
spec:
  replicas: 2
  serviceName: prometheus
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      serviceAccountName: prometheus
      # Soft anti-affinity: prefer one prometheus pod per hostname.
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                topologyKey: kubernetes.io/hostname
                labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values: [prometheus]
      volumes:
        - name: prometheus-config
          configMap:
            name: prometheus-config
        - name: prometheus-rules
          configMap:
            name: prometheus-alert-rules
      containers:
        - name: prometheus
          image: prom/prometheus:v3.0.1
          args:
            - "--config.file=/etc/prometheus/prometheus.yml"
            - "--storage.tsdb.path=/prometheus"
            - "--storage.tsdb.retention.time=30d"
            - "--web.console.libraries=/usr/share/prometheus/console_libraries"
            - "--web.console.templates=/usr/share/prometheus/consoles"
            - "--web.enable-lifecycle"
          ports:
            - name: web
              containerPort: 9090
          volumeMounts:
            - name: prometheus-config
              mountPath: /etc/prometheus
            - name: prometheus-rules
              mountPath: /etc/prometheus/rules
            - name: prometheus-storage
              mountPath: /prometheus
          resources:
            limits:
              cpu: "1"
              memory: "2Gi"
            requests:
              cpu: "500m"
              memory: "1Gi"
          readinessProbe:
            initialDelaySeconds: 5
            periodSeconds: 5
            httpGet:
              path: /-/ready
              port: 9090
          livenessProbe:
            initialDelaySeconds: 30
            periodSeconds: 10
            httpGet:
              path: /-/healthy
              port: 9090

  volumeClaimTemplates:
    - metadata:
        name: prometheus-storage
      spec:
        accessModes:
          - ReadWriteOnce
        resources:
          requests:
            storage: 20Gi
|
||||
|
||||
---
# Headless Service (clusterIP: None) named by the StatefulSet's serviceName.
kind: Service
apiVersion: v1
metadata:
  namespace: monitoring
  name: prometheus
  labels:
    app: prometheus
spec:
  type: ClusterIP
  clusterIP: None
  selector:
    app: prometheus
  ports:
    - name: web
      protocol: TCP
      port: 9090
      targetPort: 9090
|
||||
|
||||
---
# Regular ClusterIP Service in front of the prometheus pods; this is the
# backend the monitoring Ingress routes /prometheus to.
kind: Service
apiVersion: v1
metadata:
  namespace: monitoring
  name: prometheus-external
  labels:
    app: prometheus
spec:
  type: ClusterIP
  selector:
    app: prometheus
  ports:
    - name: web
      protocol: TCP
      port: 9090
      targetPort: 9090
|
||||
Reference in New Issue
Block a user