# Alertmanager for the "monitoring" namespace: configuration and notification-template ConfigMaps, a 3-replica StatefulSet, and its Services.
---
# Alertmanager main configuration, stored as a template.
# The '{{ .smtp_* }}' placeholders under `global` are not Alertmanager syntax;
# presumably the StatefulSet's init container (init-config) substitutes them
# from the alertmanager-secrets Secret before Alertmanager starts.
# NOTE(review): the init script is not part of this file - confirm it leaves
# the notification templates ({{ .Status }}, {{ range .Alerts }}, ...) intact.
apiVersion: v1
kind: ConfigMap
metadata:
  name: alertmanager-config
  namespace: monitoring
data:
  alertmanager.yml: |
    global:
      resolve_timeout: 5m
      smtp_smarthost: '{{ .smtp_host }}'
      smtp_from: '{{ .smtp_from }}'
      smtp_auth_username: '{{ .smtp_username }}'
      smtp_auth_password: '{{ .smtp_password }}'
      smtp_require_tls: true

    # Define notification templates
    templates:
      - '/etc/alertmanager/templates/*.tmpl'

    # Route alerts to appropriate receivers
    route:
      # Default receiver
      receiver: 'default-email'
      # Group alerts by these labels
      group_by: ['alertname', 'cluster', 'service']
      # Wait time before sending initial notification
      group_wait: 10s
      # Wait time before sending notifications about new alerts in the group
      group_interval: 10s
      # Wait time before re-sending a notification
      repeat_interval: 12h

      # Child routes for specific alert routing
      routes:
        # Critical alerts - send immediately to all channels.
        # `continue: true` keeps evaluating the sibling routes below, so a
        # critical alert can additionally reach a team receiver.
        - match:
            severity: critical
          receiver: 'critical-alerts'
          group_wait: 0s
          group_interval: 5m
          repeat_interval: 4h
          continue: true

        # Warning alerts - less urgent
        - match:
            severity: warning
          receiver: 'warning-alerts'
          group_wait: 30s
          group_interval: 5m
          repeat_interval: 12h

        # Alert system specific alerts
        - match:
            component: alert-system
          receiver: 'alert-system-team'
          group_wait: 10s
          repeat_interval: 6h

        # Database alerts
        - match_re:
            alertname: '^(DatabaseConnectionHigh|SlowDatabaseStorage)$'
          receiver: 'database-team'
          group_wait: 30s
          repeat_interval: 8h

        # Infrastructure alerts
        - match_re:
            alertname: '^(HighMemoryUsage|ServiceDown)$'
          receiver: 'infra-team'
          group_wait: 30s
          repeat_interval: 6h

    # Inhibition rules - prevent alert spam
    inhibit_rules:
      # If service is down, inhibit all other alerts for that service
      - source_match:
          alertname: 'ServiceDown'
        target_match_re:
          alertname: '(HighErrorRate|HighResponseTime|HighMemoryUsage)'
        equal: ['service']

      # If AlertSystem is completely down, inhibit component alerts
      - source_match:
          alertname: 'AlertSystemDown'
        target_match_re:
          alertname: 'AlertSystemComponent.*'
        equal: ['namespace']

      # If RabbitMQ is down, inhibit alert processing errors
      - source_match:
          alertname: 'RabbitMQConnectionDown'
        target_match:
          alertname: 'HighAlertProcessingErrorRate'
        equal: ['namespace']

    # Receivers - notification destinations
    receivers:
      # Default email receiver
      - name: 'default-email'
        email_configs:
          - to: 'alerts@yourdomain.com'
            headers:
              Subject: '[{{ .Status | toUpper }}] {{ .GroupLabels.alertname }} - {{ .GroupLabels.service }}'
            # "Ended" is gated on the alert status: .EndsAt is a time value,
            # which a Go template `if` always treats as true, so testing it
            # directly would print an end time for alerts that are still firing.
            html: |
              {{ range .Alerts }}
              <h2>{{ .Labels.alertname }}</h2>
              <p><strong>Status:</strong> {{ .Status }}</p>
              <p><strong>Severity:</strong> {{ .Labels.severity }}</p>
              <p><strong>Service:</strong> {{ .Labels.service }}</p>
              <p><strong>Summary:</strong> {{ .Annotations.summary }}</p>
              <p><strong>Description:</strong> {{ .Annotations.description }}</p>
              <p><strong>Started:</strong> {{ .StartsAt }}</p>
              {{ if eq .Status "resolved" }}<p><strong>Ended:</strong> {{ .EndsAt }}</p>{{ end }}
              {{ end }}

      # Critical alerts - multiple channels
      - name: 'critical-alerts'
        email_configs:
          - to: 'critical-alerts@yourdomain.com,oncall@yourdomain.com'
            headers:
              Subject: '🚨 [CRITICAL] {{ .GroupLabels.alertname }} - {{ .GroupLabels.service }}'
            send_resolved: true
        # Uncomment to enable Slack notifications
        # slack_configs:
        #   - api_url: '{{ .slack_webhook_url }}'
        #     channel: '#alerts-critical'
        #     title: '🚨 Critical Alert'
        #     text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
        #     send_resolved: true

      # Warning alerts
      - name: 'warning-alerts'
        email_configs:
          - to: 'alerts@yourdomain.com'
            headers:
              Subject: '⚠️ [WARNING] {{ .GroupLabels.alertname }} - {{ .GroupLabels.service }}'
            send_resolved: true

      # Alert system team
      - name: 'alert-system-team'
        email_configs:
          - to: 'alert-system-team@yourdomain.com'
            headers:
              Subject: '[Alert System] {{ .GroupLabels.alertname }}'
            send_resolved: true

      # Database team
      - name: 'database-team'
        email_configs:
          - to: 'database-team@yourdomain.com'
            headers:
              Subject: '[Database] {{ .GroupLabels.alertname }}'
            send_resolved: true

      # Infrastructure team
      - name: 'infra-team'
        email_configs:
          - to: 'infra-team@yourdomain.com'
            headers:
              Subject: '[Infrastructure] {{ .GroupLabels.alertname }}'
            send_resolved: true

---
# Notification templates for Alertmanager. The StatefulSet mounts this
# ConfigMap at /etc/alertmanager/templates, which is the directory matched by
# the `templates:` glob in alertmanager.yml.
apiVersion: v1
kind: ConfigMap
metadata:
  namespace: monitoring
  name: alertmanager-templates
data:
  # Defines three named templates: "cluster", "slack.default.title" and
  # "slack.default.text". The text below is passed to Alertmanager verbatim.
  default.tmpl: |
    {{ define "cluster" }}{{ .ExternalURL | reReplaceAll ".*alertmanager\\.(.*)" "$1" }}{{ end }}

    {{ define "slack.default.title" }}
    [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.alertname }}
    {{ end }}

    {{ define "slack.default.text" }}
    {{ range .Alerts }}
    *Alert:* {{ .Annotations.summary }}
    *Description:* {{ .Annotations.description }}
    *Severity:* `{{ .Labels.severity }}`
    *Service:* `{{ .Labels.service }}`
    {{ end }}
    {{ end }}

---
# Three-replica Alertmanager cluster. Peers address each other through the
# per-pod DNS names of the headless `alertmanager` Service (see --cluster.peer).
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: alertmanager
  namespace: monitoring
  labels:
    app: alertmanager
spec:
  serviceName: alertmanager
  replicas: 3
  selector:
    matchLabels:
      app: alertmanager
  template:
    metadata:
      labels:
        app: alertmanager
    spec:
      serviceAccountName: prometheus
      initContainers:
        # Runs /scripts/init-config.sh with the SMTP/Slack secrets in its
        # environment, the config template mounted at /etc/alertmanager-template
        # and the shared emptyDir at /etc/alertmanager-final.
        # NOTE(review): the script lives in the alertmanager-init-script
        # ConfigMap (not in this file); presumably it renders alertmanager.yml
        # into the emptyDir - confirm there.
        - name: init-config
          image: busybox:1.36
          command: ['/bin/sh', '/scripts/init-config.sh']
          env:
            - name: SMTP_HOST
              valueFrom:
                secretKeyRef:
                  name: alertmanager-secrets
                  key: smtp-host
            - name: SMTP_USERNAME
              valueFrom:
                secretKeyRef:
                  name: alertmanager-secrets
                  key: smtp-username
            - name: SMTP_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: alertmanager-secrets
                  key: smtp-password
            - name: SMTP_FROM
              valueFrom:
                secretKeyRef:
                  name: alertmanager-secrets
                  key: smtp-from
            # Slack is optional: the pod still starts when the key is absent.
            - name: SLACK_WEBHOOK_URL
              valueFrom:
                secretKeyRef:
                  name: alertmanager-secrets
                  key: slack-webhook-url
                  optional: true
          volumeMounts:
            - name: init-script
              mountPath: /scripts
            - name: config-template
              mountPath: /etc/alertmanager-template
            - name: config-final
              mountPath: /etc/alertmanager-final
      # Prefer (but do not require) spreading replicas across nodes.
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values:
                        - alertmanager
                topologyKey: kubernetes.io/hostname
      containers:
        - name: alertmanager
          image: prom/alertmanager:v0.27.0
          args:
            - '--config.file=/etc/alertmanager/alertmanager.yml'
            - '--storage.path=/alertmanager'
            - '--cluster.listen-address=0.0.0.0:9094'
            - '--cluster.peer=alertmanager-0.alertmanager.monitoring.svc.cluster.local:9094'
            - '--cluster.peer=alertmanager-1.alertmanager.monitoring.svc.cluster.local:9094'
            - '--cluster.peer=alertmanager-2.alertmanager.monitoring.svc.cluster.local:9094'
            - '--cluster.reconnect-timeout=5m'
            - '--web.external-url=http://monitoring.bakery-ia.local/alertmanager'
            # Serve at "/" inside the pod even though the external URL carries
            # an /alertmanager path; the probes below rely on this prefix.
            - '--web.route-prefix=/'
          ports:
            - name: web
              containerPort: 9093
            # Protocol stated explicitly because 9094 is also exposed over UDP.
            - name: mesh-tcp
              containerPort: 9094
              protocol: TCP
            - name: mesh-udp
              containerPort: 9094
              protocol: UDP
          env:
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
          volumeMounts:
            # Same emptyDir the init container mounts at /etc/alertmanager-final.
            - name: config-final
              mountPath: /etc/alertmanager
            - name: templates
              mountPath: /etc/alertmanager/templates
            - name: storage
              mountPath: /alertmanager
          resources:
            requests:
              memory: "128Mi"
              cpu: "100m"
            limits:
              memory: "256Mi"
              cpu: "500m"
          livenessProbe:
            httpGet:
              path: /-/healthy
              port: 9093
            initialDelaySeconds: 30
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /-/ready
              port: 9093
            initialDelaySeconds: 5
            periodSeconds: 5

        # Config reloader sidecar: calls Alertmanager's reload endpoint when a
        # watched directory changes.
        - name: configmap-reload
          image: jimmidyson/configmap-reload:v0.12.0
          args:
            - '--webhook-url=http://localhost:9093/-/reload'
            - '--volume-dir=/etc/alertmanager'
            # /etc/alertmanager is an emptyDir filled once at pod start, so on
            # its own it never changes. Watch the templates ConfigMap as well so
            # that template edits are picked up without restarting the pod.
            - '--volume-dir=/etc/alertmanager-templates'
          volumeMounts:
            - name: config-final
              mountPath: /etc/alertmanager
              readOnly: true
            # Mounted at a sibling path rather than nested under the read-only
            # mount above.
            - name: templates
              mountPath: /etc/alertmanager-templates
              readOnly: true
          resources:
            requests:
              memory: "16Mi"
              cpu: "10m"
            limits:
              memory: "32Mi"
              cpu: "50m"

      volumes:
        - name: init-script
          configMap:
            name: alertmanager-init-script
            # Octal file mode (rwxr-xr-x). Must stay an unquoted integer:
            # Kubernetes rejects a string here.
            defaultMode: 0755
        - name: config-template
          configMap:
            name: alertmanager-config
        # Scratch volume shared by the init container, Alertmanager and the
        # reloader sidecar.
        - name: config-final
          emptyDir: {}
        - name: templates
          configMap:
            name: alertmanager-templates

  # One persistent volume per replica, mounted at /alertmanager (--storage.path).
  volumeClaimTemplates:
    - metadata:
        name: storage
      spec:
        accessModes: ["ReadWriteOnce"]
        resources:
          requests:
            storage: 2Gi

---
# Headless Service (clusterIP: None). It is the StatefulSet's `serviceName`,
# which gives each replica the per-pod DNS name used by the --cluster.peer
# flags (alertmanager-N.alertmanager.monitoring.svc.cluster.local).
apiVersion: v1
kind: Service
metadata:
  name: alertmanager
  namespace: monitoring
  labels:
    app: alertmanager
spec:
  type: ClusterIP
  clusterIP: None
  ports:
    - name: web
      port: 9093
      targetPort: 9093
    # Cluster gossip uses port 9094 over both TCP and UDP. The protocol is
    # stated explicitly on both entries because they share a port number.
    - name: mesh-tcp
      port: 9094
      targetPort: 9094
      protocol: TCP
    - name: mesh-udp
      port: 9094
      targetPort: 9094
      protocol: UDP
  selector:
    app: alertmanager

---
# Regular (non-headless) ClusterIP Service that exposes only the Alertmanager
# web port, balanced across every pod labelled app=alertmanager.
apiVersion: v1
kind: Service
metadata:
  namespace: monitoring
  name: alertmanager-external
  labels:
    app: alertmanager
spec:
  type: ClusterIP
  selector:
    app: alertmanager
  ports:
    # Web UI / API port only; the 9094 mesh ports are not exposed here.
    - name: web
      port: 9093
      targetPort: 9093