# bakery-ia/infrastructure/kubernetes/base/components/monitoring/alertmanager.yaml
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: alertmanager-config
  namespace: monitoring
data:
  alertmanager.yml: |
    global:
      resolve_timeout: 5m
      smtp_smarthost: '{{ .smtp_host }}'
      smtp_from: '{{ .smtp_from }}'
      smtp_auth_username: '{{ .smtp_username }}'
      smtp_auth_password: '{{ .smtp_password }}'
      smtp_require_tls: true

    # Define notification templates
    templates:
      - '/etc/alertmanager/templates/*.tmpl'

    # Route alerts to appropriate receivers
    route:
      # Default receiver
      receiver: 'default-email'
      # Group alerts by these labels
      group_by: ['alertname', 'cluster', 'service']
      # Wait time before sending initial notification
      group_wait: 10s
      # Wait time before sending notifications about new alerts in the group
      group_interval: 10s
      # Wait time before re-sending a notification
      repeat_interval: 12h
      # Child routes for specific alert routing
      routes:
        # Critical alerts - send immediately to all channels
        - match:
            severity: critical
          receiver: 'critical-alerts'
          group_wait: 0s
          group_interval: 5m
          repeat_interval: 4h
          continue: true
        # Warning alerts - less urgent
        - match:
            severity: warning
          receiver: 'warning-alerts'
          group_wait: 30s
          group_interval: 5m
          repeat_interval: 12h
        # Alert system specific alerts
        - match:
            component: alert-system
          receiver: 'alert-system-team'
          group_wait: 10s
          repeat_interval: 6h
        # Database alerts
        - match_re:
            alertname: ^(DatabaseConnectionHigh|SlowDatabaseStorage)$
          receiver: 'database-team'
          group_wait: 30s
          repeat_interval: 8h
        # Infrastructure alerts
        - match_re:
            alertname: ^(HighMemoryUsage|ServiceDown)$
          receiver: 'infra-team'
          group_wait: 30s
          repeat_interval: 6h

    # Inhibition rules - prevent alert spam
    inhibit_rules:
      # If service is down, inhibit all other alerts for that service
      - source_match:
          alertname: 'ServiceDown'
        target_match_re:
          alertname: '(HighErrorRate|HighResponseTime|HighMemoryUsage)'
        equal: ['service']
      # If AlertSystem is completely down, inhibit component alerts
      - source_match:
          alertname: 'AlertSystemDown'
        target_match_re:
          alertname: 'AlertSystemComponent.*'
        equal: ['namespace']
      # If RabbitMQ is down, inhibit alert processing errors
      - source_match:
          alertname: 'RabbitMQConnectionDown'
        target_match:
          alertname: 'HighAlertProcessingErrorRate'
        equal: ['namespace']

    # Receivers - notification destinations
    receivers:
      # Default email receiver
      - name: 'default-email'
        email_configs:
          - to: 'alerts@yourdomain.com'
            headers:
              Subject: '[{{ .Status | toUpper }}] {{ .GroupLabels.alertname }} - {{ .GroupLabels.service }}'
            html: |
              {{ range .Alerts }}
              <h2>{{ .Labels.alertname }}</h2>
              <p><strong>Status:</strong> {{ .Status }}</p>
              <p><strong>Severity:</strong> {{ .Labels.severity }}</p>
              <p><strong>Service:</strong> {{ .Labels.service }}</p>
              <p><strong>Summary:</strong> {{ .Annotations.summary }}</p>
              <p><strong>Description:</strong> {{ .Annotations.description }}</p>
              <p><strong>Started:</strong> {{ .StartsAt }}</p>
              {{ if .EndsAt }}<p><strong>Ended:</strong> {{ .EndsAt }}</p>{{ end }}
              {{ end }}

      # Critical alerts - multiple channels
      - name: 'critical-alerts'
        email_configs:
          - to: 'critical-alerts@yourdomain.com,oncall@yourdomain.com'
            headers:
              Subject: '🚨 [CRITICAL] {{ .GroupLabels.alertname }} - {{ .GroupLabels.service }}'
            send_resolved: true
        # Uncomment to enable Slack notifications
        # slack_configs:
        #   - api_url: '{{ .slack_webhook_url }}'
        #     channel: '#alerts-critical'
        #     title: '🚨 Critical Alert'
        #     text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
        #     send_resolved: true

      # Warning alerts
      - name: 'warning-alerts'
        email_configs:
          - to: 'alerts@yourdomain.com'
            headers:
              Subject: '⚠️ [WARNING] {{ .GroupLabels.alertname }} - {{ .GroupLabels.service }}'
            send_resolved: true

      # Alert system team
      - name: 'alert-system-team'
        email_configs:
          - to: 'alert-system-team@yourdomain.com'
            headers:
              Subject: '[Alert System] {{ .GroupLabels.alertname }}'
            send_resolved: true

      # Database team
      - name: 'database-team'
        email_configs:
          - to: 'database-team@yourdomain.com'
            headers:
              Subject: '[Database] {{ .GroupLabels.alertname }}'
            send_resolved: true

      # Infrastructure team
      - name: 'infra-team'
        email_configs:
          - to: 'infra-team@yourdomain.com'
            headers:
              Subject: '[Infrastructure] {{ .GroupLabels.alertname }}'
            send_resolved: true
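# NOTE: the {{ .smtp_* }} and {{ .slack_webhook_url }} placeholders above are
# substituted at pod start-up by the init container in the StatefulSet below.
# Once rendered, the config can be sanity-checked locally with Alertmanager's
# bundled tooling, e.g. `amtool check-config alertmanager.yml` (validating the
# raw template as-is would fail on the unexpanded smarthost placeholder).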
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: alertmanager-templates
  namespace: monitoring
data:
  default.tmpl: |
    {{ define "cluster" }}{{ .ExternalURL | reReplaceAll ".*alertmanager\\.(.*)" "$1" }}{{ end }}
    {{ define "slack.default.title" }}
    [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.alertname }}
    {{ end }}
    {{ define "slack.default.text" }}
    {{ range .Alerts }}
    *Alert:* {{ .Annotations.summary }}
    *Description:* {{ .Annotations.description }}
    *Severity:* `{{ .Labels.severity }}`
    *Service:* `{{ .Labels.service }}`
    {{ end }}
    {{ end }}
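# NOTE: the init container below runs /scripts/init-config.sh from a ConfigMap
# named "alertmanager-init-script" that is not defined in this file. As a rough
# sketch (an assumption, not the actual script shipped in the repo), it would
# substitute the secret-backed placeholders and write the final config where
# the main container expects it, e.g.:
#
#   #!/bin/sh
#   set -e
#   sed -e "s|{{ .smtp_host }}|${SMTP_HOST}|g" \
#       -e "s|{{ .smtp_from }}|${SMTP_FROM}|g" \
#       -e "s|{{ .smtp_username }}|${SMTP_USERNAME}|g" \
#       -e "s|{{ .smtp_password }}|${SMTP_PASSWORD}|g" \
#       -e "s|{{ .slack_webhook_url }}|${SLACK_WEBHOOK_URL:-}|g" \
#       /etc/alertmanager-template/alertmanager.yml \
#       > /etc/alertmanager-final/alertmanager.yml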
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: alertmanager
  namespace: monitoring
  labels:
    app: alertmanager
spec:
  serviceName: alertmanager
  replicas: 3
  selector:
    matchLabels:
      app: alertmanager
  template:
    metadata:
      labels:
        app: alertmanager
    spec:
      serviceAccountName: prometheus
      initContainers:
        - name: init-config
          image: busybox:1.36
          command: ['/bin/sh', '/scripts/init-config.sh']
          env:
            - name: SMTP_HOST
              valueFrom:
                secretKeyRef:
                  name: alertmanager-secrets
                  key: smtp-host
            - name: SMTP_USERNAME
              valueFrom:
                secretKeyRef:
                  name: alertmanager-secrets
                  key: smtp-username
            - name: SMTP_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: alertmanager-secrets
                  key: smtp-password
            - name: SMTP_FROM
              valueFrom:
                secretKeyRef:
                  name: alertmanager-secrets
                  key: smtp-from
            - name: SLACK_WEBHOOK_URL
              valueFrom:
                secretKeyRef:
                  name: alertmanager-secrets
                  key: slack-webhook-url
                  optional: true
          volumeMounts:
            - name: init-script
              mountPath: /scripts
            - name: config-template
              mountPath: /etc/alertmanager-template
            - name: config-final
              mountPath: /etc/alertmanager-final
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values:
                        - alertmanager
                topologyKey: kubernetes.io/hostname
      containers:
        - name: alertmanager
          image: prom/alertmanager:v0.27.0
          args:
            - '--config.file=/etc/alertmanager/alertmanager.yml'
            - '--storage.path=/alertmanager'
            - '--cluster.listen-address=0.0.0.0:9094'
            - '--cluster.peer=alertmanager-0.alertmanager.monitoring.svc.cluster.local:9094'
            - '--cluster.peer=alertmanager-1.alertmanager.monitoring.svc.cluster.local:9094'
            - '--cluster.peer=alertmanager-2.alertmanager.monitoring.svc.cluster.local:9094'
            - '--cluster.reconnect-timeout=5m'
            - '--web.external-url=http://monitoring.bakery-ia.local/alertmanager'
            - '--web.route-prefix=/'
          ports:
            - name: web
              containerPort: 9093
            - name: mesh-tcp
              containerPort: 9094
            - name: mesh-udp
              containerPort: 9094
              protocol: UDP
          env:
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
          volumeMounts:
            - name: config-final
              mountPath: /etc/alertmanager
            - name: templates
              mountPath: /etc/alertmanager/templates
            - name: storage
              mountPath: /alertmanager
          resources:
            requests:
              memory: "128Mi"
              cpu: "100m"
            limits:
              memory: "256Mi"
              cpu: "500m"
          livenessProbe:
            httpGet:
              path: /-/healthy
              port: 9093
            initialDelaySeconds: 30
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /-/ready
              port: 9093
            initialDelaySeconds: 5
            periodSeconds: 5
        # Config reloader sidecar
        - name: configmap-reload
          image: jimmidyson/configmap-reload:v0.12.0
          args:
            - '--webhook-url=http://localhost:9093/-/reload'
            - '--volume-dir=/etc/alertmanager'
          volumeMounts:
            - name: config-final
              mountPath: /etc/alertmanager
              readOnly: true
          resources:
            requests:
              memory: "16Mi"
              cpu: "10m"
            limits:
              memory: "32Mi"
              cpu: "50m"
      volumes:
        - name: init-script
          configMap:
            name: alertmanager-init-script
            defaultMode: 0755
        - name: config-template
          configMap:
            name: alertmanager-config
        - name: config-final
          emptyDir: {}
        - name: templates
          configMap:
            name: alertmanager-templates
  volumeClaimTemplates:
    - metadata:
        name: storage
      spec:
        accessModes: [ "ReadWriteOnce" ]
        resources:
          requests:
            storage: 2Gi
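# The headless Service below (clusterIP: None) backs the StatefulSet's
# serviceName and gives each replica a stable DNS name of the form
# alertmanager-<ordinal>.alertmanager.monitoring.svc.cluster.local, which is
# what the --cluster.peer flags above rely on for gossip between the three
# replicas.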
---
apiVersion: v1
kind: Service
metadata:
  name: alertmanager
  namespace: monitoring
  labels:
    app: alertmanager
spec:
  type: ClusterIP
  clusterIP: None
  ports:
    - name: web
      port: 9093
      targetPort: 9093
    - name: mesh-tcp
      port: 9094
      targetPort: 9094
    - name: mesh-udp
      port: 9094
      targetPort: 9094
      protocol: UDP
  selector:
    app: alertmanager
---
apiVersion: v1
kind: Service
metadata:
  name: alertmanager-external
  namespace: monitoring
  labels:
    app: alertmanager
spec:
  type: ClusterIP
  ports:
    - name: web
      port: 9093
      targetPort: 9093
  selector:
    app: alertmanager
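# NOTE: the "alertmanager-secrets" Secret consumed by the init container is not
# defined in this file and is assumed to be provisioned separately (e.g. via a
# sealed secret or an external secrets operator). For local testing it could be
# created by hand, for example:
#
#   kubectl -n monitoring create secret generic alertmanager-secrets \
#     --from-literal=smtp-host='smtp.example.com:587' \
#     --from-literal=smtp-from='alerts@yourdomain.com' \
#     --from-literal=smtp-username='<user>' \
#     --from-literal=smtp-password='<password>' \
#     --from-literal=slack-webhook-url='<optional webhook URL>'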