Files
bakery-ia/infrastructure/kubernetes/base/components/monitoring/prometheus.yaml

279 lines
6.9 KiB
YAML
Raw Normal View History

---
# Identity the Prometheus pods run under (referenced by the StatefulSet's
# serviceAccountName and by the "prometheus" ClusterRoleBinding subject).
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: monitoring
  name: prometheus
---
# Read-only, cluster-wide permissions used by Prometheus for Kubernetes
# service discovery and for scraping through the API server.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
  # Core API group: objects watched by kubernetes_sd_configs, plus
  # nodes/proxy, which the "kubernetes-nodes" scrape job needs to reach
  # /api/v1/nodes/<node>/proxy/metrics.
  - apiGroups: [""]
    resources:
      - nodes
      - nodes/proxy
      - services
      - endpoints
      - pods
    verbs: ["get", "list", "watch"]
  # Ingress is served from networking.k8s.io; the extensions group stopped
  # serving it in Kubernetes 1.22. The legacy group is kept so the role
  # remains usable on older clusters.
  - apiGroups:
      - extensions
      - networking.k8s.io
    resources:
      - ingresses
    verbs: ["get", "list", "watch"]
  # Allows GET on the API server's own /metrics endpoint.
  - nonResourceURLs: ["/metrics"]
    verbs: ["get"]
---
# Grants the "prometheus" ClusterRole to the prometheus ServiceAccount in
# the monitoring namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
subjects:
  - kind: ServiceAccount
    namespace: monitoring
    name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
---
# Prometheus server configuration. The prometheus StatefulSet mounts this
# ConfigMap at /etc/prometheus and starts with
# --config.file=/etc/prometheus/prometheus.yml.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: monitoring
data:
  prometheus.yml: |
    global:
      scrape_interval: 30s
      evaluation_interval: 30s
      external_labels:
        cluster: 'bakery-ia'
        environment: 'production'

    # AlertManager configuration
    alerting:
      alertmanagers:
        - static_configs:
            - targets:
                - alertmanager-0.alertmanager.monitoring.svc.cluster.local:9093
                - alertmanager-1.alertmanager.monitoring.svc.cluster.local:9093
                - alertmanager-2.alertmanager.monitoring.svc.cluster.local:9093

    # Load alert rules
    rule_files:
      - '/etc/prometheus/rules/*.yml'

    scrape_configs:
      # Scrape Prometheus itself
      - job_name: 'prometheus'
        static_configs:
          - targets: ['localhost:9090']

      # Scrape all bakery-ia services
      - job_name: 'bakery-services'
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names:
                - bakery-ia
        relabel_configs:
          # Only keep targets whose container port is named "http"
          - source_labels: [__meta_kubernetes_pod_container_port_name]
            action: keep
            regex: http
          # Add service name label
          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
            target_label: service
          # Add component label
          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component]
            target_label: component
          # Add pod name
          - source_labels: [__meta_kubernetes_pod_name]
            target_label: pod
          # Set metrics path (only when the prometheus.io/path annotation is non-empty)
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          # Set scrape port (only when the prometheus.io/port annotation is numeric)
          - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
            action: replace
            regex: ([^:]+)(?::\d+)?;(\d+)
            replacement: $1:$2
            target_label: __address__

      # Scrape Kubernetes nodes through the API server proxy
      - job_name: 'kubernetes-nodes'
        # Every target is rewritten to kubernetes.default.svc:443 below. The
        # API server only accepts authenticated HTTPS on that port, so use
        # the pod's service account CA bundle and token.
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        authorization:
          credentials_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        kubernetes_sd_configs:
          - role: node
        relabel_configs:
          # Copy every Kubernetes node label onto the target
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
          - target_label: __address__
            replacement: kubernetes.default.svc:443
          - source_labels: [__meta_kubernetes_node_name]
            regex: (.+)
            target_label: __metrics_path__
            replacement: /api/v1/nodes/${1}/proxy/metrics

      # Scrape AlertManager
      - job_name: 'alertmanager'
        static_configs:
          - targets:
              - alertmanager-0.alertmanager.monitoring.svc.cluster.local:9093
              - alertmanager-1.alertmanager.monitoring.svc.cluster.local:9093
              - alertmanager-2.alertmanager.monitoring.svc.cluster.local:9093

      # Scrape PostgreSQL exporter
      - job_name: 'postgres-exporter'
        static_configs:
          - targets: ['postgres-exporter.monitoring.svc.cluster.local:9187']

      # Scrape Node Exporter
      - job_name: 'node-exporter'
        kubernetes_sd_configs:
          - role: node
        relabel_configs:
          # Swap the discovered kubelet port (10250) for the node-exporter port
          - source_labels: [__address__]
            regex: '(.*):10250'
            replacement: '${1}:9100'
            target_label: __address__
          - source_labels: [__meta_kubernetes_node_name]
            target_label: node
---
# Two-replica Prometheus server. Each replica gets its own 20Gi volume from
# the volumeClaimTemplate and loads its configuration from the
# prometheus-config ConfigMap.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: prometheus
  namespace: monitoring
  labels:
    app: prometheus
spec:
  # Governing Service for the per-pod DNS names of this StatefulSet.
  serviceName: prometheus
  replicas: 2
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      serviceAccountName: prometheus
      # The prom/prometheus image runs as the unprivileged "nobody" user
      # (UID/GID 65534). fsGroup makes the persistent volume mounted at
      # /prometheus group-writable for that user; without it a root-owned
      # volume causes the TSDB to fail on startup with "permission denied".
      securityContext:
        runAsNonRoot: true
        runAsUser: 65534
        fsGroup: 65534
      affinity:
        # Soft preference: schedule the replicas on different nodes.
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values:
                        - prometheus
                topologyKey: kubernetes.io/hostname
      containers:
        - name: prometheus
          image: prom/prometheus:v3.0.1
          args:
            - '--config.file=/etc/prometheus/prometheus.yml'
            - '--storage.tsdb.path=/prometheus'
            - '--storage.tsdb.retention.time=30d'
            - '--web.console.libraries=/usr/share/prometheus/console_libraries'
            - '--web.console.templates=/usr/share/prometheus/consoles'
            - '--web.enable-lifecycle'
          ports:
            - containerPort: 9090
              name: web
          volumeMounts:
            - name: prometheus-config
              mountPath: /etc/prometheus
            # Mounted beneath the config directory so the rule_files glob
            # '/etc/prometheus/rules/*.yml' in prometheus.yml picks them up.
            - name: prometheus-rules
              mountPath: /etc/prometheus/rules
            - name: prometheus-storage
              mountPath: /prometheus
          resources:
            requests:
              memory: "1Gi"
              cpu: "500m"
            limits:
              memory: "2Gi"
              cpu: "1"
          livenessProbe:
            httpGet:
              path: /-/healthy
              port: 9090
            initialDelaySeconds: 30
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /-/ready
              port: 9090
            initialDelaySeconds: 5
            periodSeconds: 5
      volumes:
        - name: prometheus-config
          configMap:
            name: prometheus-config
        # NOTE(review): the prometheus-alert-rules ConfigMap is not defined in
        # this manifest; it must exist in the monitoring namespace or the
        # pods will not start.
        - name: prometheus-rules
          configMap:
            name: prometheus-alert-rules
  volumeClaimTemplates:
    - metadata:
        name: prometheus-storage
      spec:
        accessModes: ["ReadWriteOnce"]
        resources:
          requests:
            storage: 20Gi
---
# Headless Service (clusterIP: None) named by the prometheus StatefulSet's
# serviceName; selects the pods labelled app=prometheus.
apiVersion: v1
kind: Service
metadata:
  namespace: monitoring
  name: prometheus
  labels:
    app: prometheus
spec:
  clusterIP: None
  type: ClusterIP
  selector:
    app: prometheus
  ports:
    - name: web
      protocol: TCP
      port: 9090
      targetPort: 9090
---
# Regular ClusterIP Service in front of the pods labelled app=prometheus,
# exposing the web port 9090 under a single service address.
apiVersion: v1
kind: Service
metadata:
  namespace: monitoring
  name: prometheus-external
  labels:
    app: prometheus
spec:
  type: ClusterIP
  selector:
    app: prometheus
  ports:
    - name: web
      protocol: TCP
      port: 9090
      targetPort: 9090