---
apiVersion: v1
kind: ConfigMap
metadata:
  name: alertmanager-config
  namespace: monitoring
data:
  alertmanager.yml: |
    global:
      resolve_timeout: 5m
      smtp_smarthost: '{{ .smtp_host }}'
      smtp_from: '{{ .smtp_from }}'
      smtp_auth_username: '{{ .smtp_username }}'
      smtp_auth_password: '{{ .smtp_password }}'
      smtp_require_tls: true

    # Define notification templates
    templates:
      - '/etc/alertmanager/templates/*.tmpl'

    # Route alerts to appropriate receivers
    route:
      # Default receiver
      receiver: 'default-email'
      # Group alerts by these labels
      group_by: ['alertname', 'cluster', 'service']
      # Wait time before sending initial notification
      group_wait: 10s
      # Wait time before sending notifications about new alerts in the group
      group_interval: 10s
      # Wait time before re-sending a notification
      repeat_interval: 12h

      # Child routes for specific alert routing
      routes:
        # Critical alerts - send immediately to all channels
        - match:
            severity: critical
          receiver: 'critical-alerts'
          group_wait: 0s
          group_interval: 5m
          repeat_interval: 4h
          continue: true

        # Warning alerts - less urgent
        - match:
            severity: warning
          receiver: 'warning-alerts'
          group_wait: 30s
          group_interval: 5m
          repeat_interval: 12h

        # Alert system specific alerts
        - match:
            component: alert-system
          receiver: 'alert-system-team'
          group_wait: 10s
          repeat_interval: 6h

        # Database alerts
        - match_re:
            alertname: ^(DatabaseConnectionHigh|SlowDatabaseStorage)$
          receiver: 'database-team'
          group_wait: 30s
          repeat_interval: 8h

        # Infrastructure alerts
        - match_re:
            alertname: ^(HighMemoryUsage|ServiceDown)$
          receiver: 'infra-team'
          group_wait: 30s
          repeat_interval: 6h

    # Inhibition rules - prevent alert spam
    inhibit_rules:
      # If service is down, inhibit all other alerts for that service
      - source_match:
          alertname: 'ServiceDown'
        target_match_re:
          alertname: '(HighErrorRate|HighResponseTime|HighMemoryUsage)'
        equal: ['service']

      # If AlertSystem is completely down, inhibit component alerts
      - source_match:
          alertname: 'AlertSystemDown'
        target_match_re:
          alertname: 'AlertSystemComponent.*'
        equal: ['namespace']

      # If RabbitMQ is down, inhibit alert processing errors
      - source_match:
          alertname: 'RabbitMQConnectionDown'
        target_match:
          alertname: 'HighAlertProcessingErrorRate'
        equal: ['namespace']

    # Receivers - notification destinations
    receivers:
      # Default email receiver
      - name: 'default-email'
        email_configs:
          - to: 'alerts@yourdomain.com'
            headers:
              Subject: '[{{ .Status | toUpper }}] {{ .GroupLabels.alertname }} - {{ .GroupLabels.service }}'
            html: |
              {{ range .Alerts }}
              Status: {{ .Status }}
              Severity: {{ .Labels.severity }}
              Service: {{ .Labels.service }}
              Summary: {{ .Annotations.summary }}
              Description: {{ .Annotations.description }}
              Started: {{ .StartsAt }}
              {{ if eq .Status "resolved" }}Ended: {{ .EndsAt }}
              {{ end }}
              {{ end }}

      # Critical alerts - multiple channels
      - name: 'critical-alerts'
        email_configs:
          - to: 'critical-alerts@yourdomain.com,oncall@yourdomain.com'
            headers:
              Subject: '🚨 [CRITICAL] {{ .GroupLabels.alertname }} - {{ .GroupLabels.service }}'
            send_resolved: true
        # Uncomment to enable Slack notifications
        # slack_configs:
        #   - api_url: '{{ .slack_webhook_url }}'
        #     channel: '#alerts-critical'
        #     title: '🚨 Critical Alert'
        #     text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
        #     send_resolved: true

      # Warning alerts
      - name: 'warning-alerts'
        email_configs:
          - to: 'alerts@yourdomain.com'
            headers:
              Subject: '⚠️ [WARNING] {{ .GroupLabels.alertname }} - {{ .GroupLabels.service }}'
            send_resolved: true

      # Alert system team
      - name: 'alert-system-team'
        email_configs:
          - to: 'alert-system-team@yourdomain.com'
            headers:
              Subject: '[Alert System] {{ .GroupLabels.alertname }}'
            send_resolved: true

      # Database team
      - name: 'database-team'
        email_configs:
          - to: 'database-team@yourdomain.com'
            headers:
              Subject: '[Database] {{ .GroupLabels.alertname }}'
            send_resolved: true

      # Infrastructure team
      - name: 'infra-team'
        email_configs:
          - to: 'infra-team@yourdomain.com'
            headers:
              Subject: '[Infrastructure] {{ .GroupLabels.alertname }}'
            send_resolved: true
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: alertmanager-templates
  namespace: monitoring
data:
  default.tmpl: |
    {{ define "cluster" }}{{ .ExternalURL | reReplaceAll ".*alertmanager\\.(.*)" "$1" }}{{ end }}

    {{ define "slack.default.title" }}
    [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.alertname }}
    {{ end }}

    {{ define "slack.default.text" }}
    {{ range .Alerts }}
    *Alert:* {{ .Annotations.summary }}
    *Description:* {{ .Annotations.description }}
    *Severity:* `{{ .Labels.severity }}`
    *Service:* `{{ .Labels.service }}`
    {{ end }}
    {{ end }}
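# NOTE: The StatefulSet below reads SMTP and Slack credentials from a Secret
# named 'alertmanager-secrets' that is not defined in this file. The manifest
# below is only an illustrative sketch: the key names match the secretKeyRef
# entries used by the init container, but every value is a placeholder and
# should come from your own secret management.
---
apiVersion: v1
kind: Secret
metadata:
  name: alertmanager-secrets
  namespace: monitoring
type: Opaque
stringData:
  smtp-host: 'smtp.example.com:587'          # placeholder
  smtp-from: 'alertmanager@yourdomain.com'   # placeholder
  smtp-username: 'alerts@yourdomain.com'     # placeholder
  smtp-password: 'change-me'                 # placeholder
  slack-webhook-url: ''                      # optional; only used if slack_configs is enabled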
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: alertmanager
  namespace: monitoring
  labels:
    app: alertmanager
spec:
  serviceName: alertmanager
  replicas: 3
  selector:
    matchLabels:
      app: alertmanager
  template:
    metadata:
      labels:
        app: alertmanager
    spec:
      serviceAccountName: prometheus
      initContainers:
        - name: init-config
          image: busybox:1.36
          command: ['/bin/sh', '/scripts/init-config.sh']
          env:
            - name: SMTP_HOST
              valueFrom:
                secretKeyRef:
                  name: alertmanager-secrets
                  key: smtp-host
            - name: SMTP_USERNAME
              valueFrom:
                secretKeyRef:
                  name: alertmanager-secrets
                  key: smtp-username
            - name: SMTP_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: alertmanager-secrets
                  key: smtp-password
            - name: SMTP_FROM
              valueFrom:
                secretKeyRef:
                  name: alertmanager-secrets
                  key: smtp-from
            - name: SLACK_WEBHOOK_URL
              valueFrom:
                secretKeyRef:
                  name: alertmanager-secrets
                  key: slack-webhook-url
                  optional: true
          volumeMounts:
            - name: init-script
              mountPath: /scripts
            - name: config-template
              mountPath: /etc/alertmanager-template
            - name: config-final
              mountPath: /etc/alertmanager-final
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values:
                        - alertmanager
                topologyKey: kubernetes.io/hostname
      containers:
        - name: alertmanager
          image: prom/alertmanager:v0.27.0
          args:
            - '--config.file=/etc/alertmanager/alertmanager.yml'
            - '--storage.path=/alertmanager'
            - '--cluster.listen-address=0.0.0.0:9094'
            - '--cluster.peer=alertmanager-0.alertmanager.monitoring.svc.cluster.local:9094'
            - '--cluster.peer=alertmanager-1.alertmanager.monitoring.svc.cluster.local:9094'
            - '--cluster.peer=alertmanager-2.alertmanager.monitoring.svc.cluster.local:9094'
            - '--cluster.reconnect-timeout=5m'
            - '--web.external-url=http://monitoring.bakery-ia.local/alertmanager'
            - '--web.route-prefix=/'
          ports:
            - name: web
              containerPort: 9093
            - name: mesh-tcp
              containerPort: 9094
            - name: mesh-udp
              containerPort: 9094
              protocol: UDP
          env:
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
          volumeMounts:
            - name: config-final
              mountPath: /etc/alertmanager
            - name: templates
              mountPath: /etc/alertmanager/templates
            - name: storage
              mountPath: /alertmanager
          resources:
            requests:
              memory: "128Mi"
              cpu: "100m"
            limits:
              memory: "256Mi"
              cpu: "500m"
          livenessProbe:
            httpGet:
              path: /-/healthy
              port: 9093
            initialDelaySeconds: 30
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /-/ready
              port: 9093
            initialDelaySeconds: 5
            periodSeconds: 5
        # Config reloader sidecar
        - name: configmap-reload
          image: jimmidyson/configmap-reload:v0.12.0
          args:
            - '--webhook-url=http://localhost:9093/-/reload'
            - '--volume-dir=/etc/alertmanager'
          volumeMounts:
            - name: config-final
              mountPath: /etc/alertmanager
              readOnly: true
          resources:
            requests:
              memory: "16Mi"
              cpu: "10m"
            limits:
              memory: "32Mi"
              cpu: "50m"
      volumes:
        - name: init-script
          configMap:
            name: alertmanager-init-script
            defaultMode: 0755
        - name: config-template
          configMap:
            name: alertmanager-config
        - name: config-final
          emptyDir: {}
        - name: templates
          configMap:
            name: alertmanager-templates
  volumeClaimTemplates:
    - metadata:
        name: storage
      spec:
        accessModes: [ "ReadWriteOnce" ]
        resources:
          requests:
            storage: 2Gi
---
apiVersion: v1
kind: Service
metadata:
  name: alertmanager
  namespace: monitoring
  labels:
    app: alertmanager
spec:
  type: ClusterIP
  clusterIP: None
  ports:
    - name: web
      port: 9093
      targetPort: 9093
    - name: mesh-tcp
      port: 9094
      targetPort: 9094
    - name: mesh-udp
      port: 9094
      targetPort: 9094
      protocol: UDP
  selector:
    app: alertmanager
---
apiVersion: v1
kind: Service
metadata:
  name: alertmanager-external
  namespace: monitoring
  labels:
    app: alertmanager
spec:
  type: ClusterIP
  ports:
    - name: web
      port: 9093
      targetPort: 9093
  selector:
    app: alertmanager
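# The init container in the StatefulSet above runs /scripts/init-config.sh from
# a ConfigMap named 'alertmanager-init-script' that is not included in this
# file. The ConfigMap below is a sketch of one plausible implementation,
# assuming the script's only job is to replace the '{{ .smtp_host }}'-style
# placeholders in the templated config with the SMTP_* / SLACK_* environment
# variables and write the rendered file to the config-final volume. Adjust it
# to match the actual script used in this repository.
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: alertmanager-init-script
  namespace: monitoring
data:
  init-config.sh: |
    #!/bin/sh
    # Sketch: render the Alertmanager config by substituting credential
    # placeholders with the values injected from alertmanager-secrets.
    # Assumes the secret values contain no characters special to sed
    # (such as '|', '&' or backslashes).
    set -e
    sed \
      -e "s|{{ .smtp_host }}|${SMTP_HOST}|g" \
      -e "s|{{ .smtp_from }}|${SMTP_FROM}|g" \
      -e "s|{{ .smtp_username }}|${SMTP_USERNAME}|g" \
      -e "s|{{ .smtp_password }}|${SMTP_PASSWORD}|g" \
      -e "s|{{ .slack_webhook_url }}|${SLACK_WEBHOOK_URL:-}|g" \
      /etc/alertmanager-template/alertmanager.yml \
      > /etc/alertmanager-final/alertmanager.yml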
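# For reference: Prometheus must be told about every Alertmanager replica so
# that alerts reach all cluster peers (the headless 'alertmanager' Service
# above provides the per-pod DNS names). A minimal sketch of the corresponding
# prometheus.yml section, assuming the Prometheus configuration is maintained
# elsewhere in this repository:
#
#   alerting:
#     alertmanagers:
#       - static_configs:
#           - targets:
#               - alertmanager-0.alertmanager.monitoring.svc.cluster.local:9093
#               - alertmanager-1.alertmanager.monitoring.svc.cluster.local:9093
#               - alertmanager-2.alertmanager.monitoring.svc.cluster.local:9093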