Improve monitoring for prod
This commit is contained in:
@@ -0,0 +1,391 @@
|
||||
---
# Alertmanager main configuration, stored as a template.
#
# The StatefulSet's init container mounts this ConfigMap at
# /etc/alertmanager-template and runs /scripts/init-config.sh with SMTP_* and
# SLACK_WEBHOOK_URL environment variables taken from the `alertmanager-secrets`
# Secret. The lowercase placeholders in `global:` (smtp_host, smtp_from,
# smtp_username, smtp_password) are presumably substituted by that script
# before the result is written to the volume Alertmanager reads from.
# NOTE(review): the script is not part of this manifest - confirm that it
# leaves the Alertmanager notification-template expressions (.Status,
# .GroupLabels, .Alerts, ...) untouched and that smtp-host carries host:port.
apiVersion: v1
kind: ConfigMap
metadata:
  name: alertmanager-config
  namespace: monitoring
data:
  alertmanager.yml: |
    global:
      resolve_timeout: 5m
      smtp_smarthost: '{{ .smtp_host }}'
      smtp_from: '{{ .smtp_from }}'
      smtp_auth_username: '{{ .smtp_username }}'
      smtp_auth_password: '{{ .smtp_password }}'
      smtp_require_tls: true

    # Define notification templates
    templates:
      - '/etc/alertmanager/templates/*.tmpl'

    # Route alerts to appropriate receivers
    route:
      # Default receiver
      receiver: 'default-email'
      # Group alerts by these labels
      group_by: ['alertname', 'cluster', 'service']
      # Wait time before sending initial notification
      group_wait: 10s
      # Wait time before sending notifications about new alerts in the group
      # NOTE(review): child routes that do not set group_interval (alert-system,
      # database, infra) inherit this 10s value - confirm that is intended.
      group_interval: 10s
      # Wait time before re-sending a notification
      repeat_interval: 12h

      # Child routes for specific alert routing
      routes:
        # Critical alerts - send immediately to all channels
        - match:
            severity: critical
          receiver: 'critical-alerts'
          group_wait: 0s
          group_interval: 5m
          repeat_interval: 4h
          # Keep evaluating the sibling routes below after this one matches
          continue: true

        # Warning alerts - less urgent
        # NOTE(review): no `continue: true` here, so warning-severity alerts stop
        # at this route and never reach the team routes below - confirm.
        - match:
            severity: warning
          receiver: 'warning-alerts'
          group_wait: 30s
          group_interval: 5m
          repeat_interval: 12h

        # Alert system specific alerts
        - match:
            component: alert-system
          receiver: 'alert-system-team'
          group_wait: 10s
          repeat_interval: 6h

        # Database alerts
        - match_re:
            alertname: ^(DatabaseConnectionHigh|SlowDatabaseStorage)$
          receiver: 'database-team'
          group_wait: 30s
          repeat_interval: 8h

        # Infrastructure alerts
        - match_re:
            alertname: ^(HighMemoryUsage|ServiceDown)$
          receiver: 'infra-team'
          group_wait: 30s
          repeat_interval: 6h

    # Inhibition rules - prevent alert spam
    inhibit_rules:
      # If service is down, inhibit all other alerts for that service
      - source_match:
          alertname: 'ServiceDown'
        target_match_re:
          alertname: '(HighErrorRate|HighResponseTime|HighMemoryUsage)'
        equal: ['service']

      # If AlertSystem is completely down, inhibit component alerts
      - source_match:
          alertname: 'AlertSystemDown'
        target_match_re:
          alertname: 'AlertSystemComponent.*'
        equal: ['namespace']

      # If RabbitMQ is down, inhibit alert processing errors
      - source_match:
          alertname: 'RabbitMQConnectionDown'
        target_match:
          alertname: 'HighAlertProcessingErrorRate'
        equal: ['namespace']

    # Receivers - notification destinations
    receivers:
      # Default email receiver
      - name: 'default-email'
        email_configs:
          - to: 'alerts@yourdomain.com'
            headers:
              Subject: '[{{ .Status | toUpper }}] {{ .GroupLabels.alertname }} - {{ .GroupLabels.service }}'
            # Email receivers do not send resolved notifications unless asked;
            # the subject and body below render the resolved state, and every
            # other receiver in this file enables it.
            send_resolved: true
            html: |
              {{ range .Alerts }}
              <h2>{{ .Labels.alertname }}</h2>
              <p><strong>Status:</strong> {{ .Status }}</p>
              <p><strong>Severity:</strong> {{ .Labels.severity }}</p>
              <p><strong>Service:</strong> {{ .Labels.service }}</p>
              <p><strong>Summary:</strong> {{ .Annotations.summary }}</p>
              <p><strong>Description:</strong> {{ .Annotations.description }}</p>
              <p><strong>Started:</strong> {{ .StartsAt }}</p>
              {{ if eq .Status "resolved" }}<p><strong>Ended:</strong> {{ .EndsAt }}</p>{{ end }}
              {{ end }}

      # Critical alerts - multiple channels
      - name: 'critical-alerts'
        email_configs:
          - to: 'critical-alerts@yourdomain.com,oncall@yourdomain.com'
            headers:
              Subject: '🚨 [CRITICAL] {{ .GroupLabels.alertname }} - {{ .GroupLabels.service }}'
            send_resolved: true
        # Uncomment to enable Slack notifications
        # slack_configs:
        #   - api_url: '{{ .slack_webhook_url }}'
        #     channel: '#alerts-critical'
        #     title: '🚨 Critical Alert'
        #     text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
        #     send_resolved: true

      # Warning alerts
      - name: 'warning-alerts'
        email_configs:
          - to: 'alerts@yourdomain.com'
            headers:
              Subject: '⚠️ [WARNING] {{ .GroupLabels.alertname }} - {{ .GroupLabels.service }}'
            send_resolved: true

      # Alert system team
      - name: 'alert-system-team'
        email_configs:
          - to: 'alert-system-team@yourdomain.com'
            headers:
              Subject: '[Alert System] {{ .GroupLabels.alertname }}'
            send_resolved: true

      # Database team
      - name: 'database-team'
        email_configs:
          - to: 'database-team@yourdomain.com'
            headers:
              Subject: '[Database] {{ .GroupLabels.alertname }}'
            send_resolved: true

      # Infrastructure team
      - name: 'infra-team'
        email_configs:
          - to: 'infra-team@yourdomain.com'
            headers:
              Subject: '[Infrastructure] {{ .GroupLabels.alertname }}'
            send_resolved: true

---
# Custom Alertmanager notification templates.
#
# The StatefulSet mounts this ConfigMap at /etc/alertmanager/templates, which
# matches the `templates:` glob ('/etc/alertmanager/templates/*.tmpl') in
# alertmanager.yml.
#
# The `-` trim markers inside the define/range/end actions strip the newlines
# that would otherwise surround the rendered title and text, so the title is a
# single line and each alert in the text is separated by exactly one blank line.
# NOTE(review): "slack.default.title" / "slack.default.text" look like the names
# of Alertmanager's built-in Slack templates, so these definitions presumably
# override the defaults for every Slack receiver - confirm that is intended.
apiVersion: v1
kind: ConfigMap
metadata:
  name: alertmanager-templates
  namespace: monitoring
data:
  default.tmpl: |
    {{ define "cluster" }}{{ .ExternalURL | reReplaceAll ".*alertmanager\\.(.*)" "$1" }}{{ end }}

    {{ define "slack.default.title" -}}
    [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.alertname }}
    {{- end }}

    {{ define "slack.default.text" -}}
    {{ range .Alerts -}}
    *Alert:* {{ .Annotations.summary }}
    *Description:* {{ .Annotations.description }}
    *Severity:* `{{ .Labels.severity }}`
    *Service:* `{{ .Labels.service }}`

    {{ end -}}
    {{ end }}

---
# Three-replica Alertmanager cluster.
#
# Pod layout:
#   - init-config (init container): runs /scripts/init-config.sh with SMTP and
#     Slack settings from the `alertmanager-secrets` Secret; it has the config
#     template mounted at /etc/alertmanager-template and the shared `config-final`
#     emptyDir at /etc/alertmanager-final.
#   - alertmanager: reads /etc/alertmanager/alertmanager.yml from `config-final`
#     and gossips with its peers on port 9094 (TCP and UDP).
#   - configmap-reload: POSTs to Alertmanager's /-/reload endpoint when a watched
#     volume changes.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: alertmanager
  namespace: monitoring
  labels:
    app: alertmanager
spec:
  # Headless Service that provides the per-pod DNS names used by --cluster.peer
  serviceName: alertmanager
  replicas: 3
  selector:
    matchLabels:
      app: alertmanager
  template:
    metadata:
      labels:
        app: alertmanager
    spec:
      serviceAccountName: prometheus
      # Make mounted volumes (notably the `storage` PVC) group-writable for the
      # non-root Alertmanager process.
      # NOTE(review): assumes the prom/alertmanager image runs as nobody
      # (uid/gid 65534) - confirm against the image in use.
      securityContext:
        fsGroup: 65534
      initContainers:
        - name: init-config
          image: busybox:1.36
          command: ['/bin/sh', '/scripts/init-config.sh']
          env:
            - name: SMTP_HOST
              valueFrom:
                secretKeyRef:
                  name: alertmanager-secrets
                  key: smtp-host
            - name: SMTP_USERNAME
              valueFrom:
                secretKeyRef:
                  name: alertmanager-secrets
                  key: smtp-username
            - name: SMTP_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: alertmanager-secrets
                  key: smtp-password
            - name: SMTP_FROM
              valueFrom:
                secretKeyRef:
                  name: alertmanager-secrets
                  key: smtp-from
            # Slack is optional: the pod still starts when the key is absent
            - name: SLACK_WEBHOOK_URL
              valueFrom:
                secretKeyRef:
                  name: alertmanager-secrets
                  key: slack-webhook-url
                  optional: true
          volumeMounts:
            - name: init-script
              mountPath: /scripts
            - name: config-template
              mountPath: /etc/alertmanager-template
            - name: config-final
              mountPath: /etc/alertmanager-final
      affinity:
        # Prefer (but do not require) spreading replicas across nodes
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values:
                        - alertmanager
                topologyKey: kubernetes.io/hostname
      containers:
        - name: alertmanager
          image: prom/alertmanager:v0.27.0
          args:
            - '--config.file=/etc/alertmanager/alertmanager.yml'
            - '--storage.path=/alertmanager'
            - '--cluster.listen-address=0.0.0.0:9094'
            # Every replica lists all three pods (itself included) as peers
            - '--cluster.peer=alertmanager-0.alertmanager.monitoring.svc.cluster.local:9094'
            - '--cluster.peer=alertmanager-1.alertmanager.monitoring.svc.cluster.local:9094'
            - '--cluster.peer=alertmanager-2.alertmanager.monitoring.svc.cluster.local:9094'
            - '--cluster.reconnect-timeout=5m'
            - '--web.external-url=http://monitoring.bakery-ia.local/alertmanager'
            # Serve at the root path inside the pod even though the external URL
            # has an /alertmanager prefix; the probes below rely on this.
            - '--web.route-prefix=/'
          ports:
            - name: web
              containerPort: 9093
            - name: mesh-tcp
              containerPort: 9094
            - name: mesh-udp
              containerPort: 9094
              protocol: UDP
          env:
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
          volumeMounts:
            - name: config-final
              mountPath: /etc/alertmanager
            # Nested under the config mount so the `templates:` glob in
            # alertmanager.yml resolves
            - name: templates
              mountPath: /etc/alertmanager/templates
            - name: storage
              mountPath: /alertmanager
          resources:
            requests:
              memory: "128Mi"
              cpu: "100m"
            limits:
              memory: "256Mi"
              cpu: "500m"
          livenessProbe:
            httpGet:
              path: /-/healthy
              port: 9093
            initialDelaySeconds: 30
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /-/ready
              port: 9093
            initialDelaySeconds: 5
            periodSeconds: 5

        # Config reloader sidecar
        # NOTE(review): /etc/alertmanager is an emptyDir whose contents are
        # presumably written once by the init container, so edits to the
        # alertmanager-config ConfigMap are not seen here and still need a pod
        # restart - confirm. The templates ConfigMap is mounted directly, so it
        # is watched as well and template edits trigger a reload.
        - name: configmap-reload
          image: jimmidyson/configmap-reload:v0.12.0
          args:
            - '--webhook-url=http://localhost:9093/-/reload'
            - '--volume-dir=/etc/alertmanager'
            - '--volume-dir=/etc/alertmanager/templates'
          volumeMounts:
            - name: config-final
              mountPath: /etc/alertmanager
              readOnly: true
            - name: templates
              mountPath: /etc/alertmanager/templates
              readOnly: true
          resources:
            requests:
              memory: "16Mi"
              cpu: "10m"
            limits:
              memory: "32Mi"
              cpu: "50m"

      volumes:
        # ConfigMap `alertmanager-init-script` is not defined in this manifest
        - name: init-script
          configMap:
            name: alertmanager-init-script
            defaultMode: 0755
        - name: config-template
          configMap:
            name: alertmanager-config
        # Shared between the init container and the main containers
        - name: config-final
          emptyDir: {}
        - name: templates
          configMap:
            name: alertmanager-templates

  # One PVC per replica, mounted at /alertmanager (--storage.path)
  volumeClaimTemplates:
    - metadata:
        name: storage
      spec:
        accessModes: ["ReadWriteOnce"]
        resources:
          requests:
            storage: 2Gi

---
# Headless governing Service for the alertmanager StatefulSet.
#
# It provides the per-pod DNS names
# (alertmanager-N.alertmanager.monitoring.svc.cluster.local) that the
# --cluster.peer flags point at, and exposes the web and gossip ports.
apiVersion: v1
kind: Service
metadata:
  name: alertmanager
  namespace: monitoring
  labels:
    app: alertmanager
spec:
  type: ClusterIP
  clusterIP: None
  # Publish DNS records for pods that are not Ready yet, so replicas can
  # resolve and join each other while the cluster is still starting up or
  # rolling. Clients that must only reach Ready pods should use the
  # `alertmanager-external` Service instead.
  publishNotReadyAddresses: true
  ports:
    - name: web
      port: 9093
      targetPort: 9093
    # Cluster gossip uses the same port number over both TCP and UDP
    - name: mesh-tcp
      port: 9094
      targetPort: 9094
    - name: mesh-udp
      port: 9094
      targetPort: 9094
      protocol: UDP
  selector:
    app: alertmanager

---
# Regular (non-headless) ClusterIP Service that load-balances the Alertmanager
# web/API port 9093 across the pods labelled app=alertmanager. Only the web
# port is exposed here; the gossip port stays on the headless Service.
apiVersion: v1
kind: Service
metadata:
  namespace: monitoring
  name: alertmanager-external
  labels:
    app: alertmanager
spec:
  selector:
    app: alertmanager
  type: ClusterIP
  ports:
    - name: web
      targetPort: 9093
      port: 9093

Reference in New Issue
Block a user