---
# Identity that the Prometheus pods run as (referenced by the StatefulSet's
# serviceAccountName and by the ClusterRoleBinding below).
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: monitoring
---
# Cluster-wide read-only permissions used by Prometheus service discovery
# and by scrapes that go through the API server.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
  # nodes/proxy is required by the 'kubernetes-nodes' job, which scrapes
  # /api/v1/nodes/<node>/proxy/metrics through the API server.
  - apiGroups: [""]
    resources:
      - nodes
      - nodes/proxy
      - services
      - endpoints
      - pods
    verbs: ["get", "list", "watch"]
  # Ingress is served from networking.k8s.io on current clusters; the
  # legacy 'extensions' group is kept so older clusters keep working.
  - apiGroups:
      - extensions
      - networking.k8s.io
    resources:
      - ingresses
    verbs: ["get", "list", "watch"]
  - nonResourceURLs: ["/metrics"]
    verbs: ["get"]
---
# Grants the ClusterRole above to the monitoring/prometheus ServiceAccount.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
  - kind: ServiceAccount
    name: prometheus
    namespace: monitoring
---
# Prometheus server configuration, mounted at /etc/prometheus by the
# StatefulSet below (so the key name must stay 'prometheus.yml').
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: monitoring
data:
  prometheus.yml: |
    global:
      scrape_interval: 30s
      evaluation_interval: 30s
      external_labels:
        cluster: 'bakery-ia'
        environment: 'production'

    # AlertManager configuration
    alerting:
      alertmanagers:
        - static_configs:
            - targets:
                - alertmanager-0.alertmanager.monitoring.svc.cluster.local:9093
                - alertmanager-1.alertmanager.monitoring.svc.cluster.local:9093
                - alertmanager-2.alertmanager.monitoring.svc.cluster.local:9093

    # Load alert rules
    rule_files:
      - '/etc/prometheus/rules/*.yml'

    scrape_configs:
      # Scrape Prometheus itself
      - job_name: 'prometheus'
        static_configs:
          - targets: ['localhost:9090']

      # Scrape all bakery-ia services
      - job_name: 'bakery-services'
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names:
                - bakery-ia
        relabel_configs:
          # Only scrape pods with metrics port
          - source_labels: [__meta_kubernetes_pod_container_port_name]
            action: keep
            regex: http
          # Add service name label
          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
            target_label: service
          # Add component label
          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component]
            target_label: component
          # Add pod name
          - source_labels: [__meta_kubernetes_pod_name]
            target_label: pod
          # Set metrics path
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          # Set scrape port
          - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
            action: replace
            regex: ([^:]+)(?::\d+)?;(\d+)
            replacement: $1:$2
            target_label: __address__

      # Scrape Kubernetes nodes
      - job_name: 'kubernetes-nodes'
        # Targets are rewritten below to the API server on port 443, so the
        # scrape must use HTTPS, trust the cluster CA and authenticate with
        # the pod's service-account token.
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        authorization:
          credentials_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        kubernetes_sd_configs:
          - role: node
        relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
          - target_label: __address__
            replacement: kubernetes.default.svc:443
          - source_labels: [__meta_kubernetes_node_name]
            regex: (.+)
            target_label: __metrics_path__
            replacement: /api/v1/nodes/${1}/proxy/metrics

      # Scrape AlertManager
      - job_name: 'alertmanager'
        static_configs:
          - targets:
              - alertmanager-0.alertmanager.monitoring.svc.cluster.local:9093
              - alertmanager-1.alertmanager.monitoring.svc.cluster.local:9093
              - alertmanager-2.alertmanager.monitoring.svc.cluster.local:9093

      # Scrape PostgreSQL exporter
      - job_name: 'postgres-exporter'
        static_configs:
          - targets: ['postgres-exporter.monitoring.svc.cluster.local:9187']

      # Scrape Node Exporter
      - job_name: 'node-exporter'
        kubernetes_sd_configs:
          - role: node
        relabel_configs:
          - source_labels: [__address__]
            regex: '(.*):10250'
            replacement: '${1}:9100'
            target_label: __address__
          - source_labels: [__meta_kubernetes_node_name]
            target_label: node
---
# Two-replica Prometheus server with one 20Gi volume per replica.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: prometheus
  namespace: monitoring
  labels:
    app: prometheus
spec:
  # Must match the headless Service named 'prometheus' defined below.
  serviceName: prometheus
  replicas: 2
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      serviceAccountName: prometheus
      # The prom/prometheus image runs as 'nobody' (65534). fsGroup makes
      # the per-replica data volume group-writable for that user on
      # provisioners that create volumes owned by root.
      securityContext:
        runAsUser: 65534
        runAsGroup: 65534
        runAsNonRoot: true
        fsGroup: 65534
      affinity:
        # Soft preference: spread the replicas across different nodes.
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values:
                        - prometheus
                topologyKey: kubernetes.io/hostname
      containers:
        - name: prometheus
          image: prom/prometheus:v3.0.1
          args:
            - '--config.file=/etc/prometheus/prometheus.yml'
            - '--storage.tsdb.path=/prometheus'
            - '--storage.tsdb.retention.time=30d'
            - '--web.console.libraries=/usr/share/prometheus/console_libraries'
            - '--web.console.templates=/usr/share/prometheus/consoles'
            - '--web.enable-lifecycle'
          ports:
            - containerPort: 9090
              name: web
          volumeMounts:
            - name: prometheus-config
              mountPath: /etc/prometheus
            # Matches the rule_files glob '/etc/prometheus/rules/*.yml'.
            - name: prometheus-rules
              mountPath: /etc/prometheus/rules
            # Matches --storage.tsdb.path above.
            - name: prometheus-storage
              mountPath: /prometheus
          resources:
            requests:
              memory: "1Gi"
              cpu: "500m"
            limits:
              memory: "2Gi"
              cpu: "1"
          livenessProbe:
            httpGet:
              path: /-/healthy
              port: 9090
            initialDelaySeconds: 30
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /-/ready
              port: 9090
            initialDelaySeconds: 5
            periodSeconds: 5
      volumes:
        - name: prometheus-config
          configMap:
            name: prometheus-config
        # NOTE(review): 'prometheus-alert-rules' is not defined in this
        # manifest; confirm that ConfigMap exists in 'monitoring'.
        - name: prometheus-rules
          configMap:
            name: prometheus-alert-rules
  volumeClaimTemplates:
    - metadata:
        name: prometheus-storage
      spec:
        accessModes: ["ReadWriteOnce"]
        resources:
          requests:
            storage: 20Gi
---
# Headless Service (clusterIP: None) backing the StatefulSet's serviceName.
apiVersion: v1
kind: Service
metadata:
  name: prometheus
  namespace: monitoring
  labels:
    app: prometheus
spec:
  type: ClusterIP
  clusterIP: None
  ports:
    - port: 9090
      targetPort: 9090
      protocol: TCP
      name: web
  selector:
    app: prometheus
---
# Regular ClusterIP Service selecting the same Prometheus pods on port 9090.
apiVersion: v1
kind: Service
metadata:
  name: prometheus-external
  namespace: monitoring
  labels:
    app: prometheus
spec:
  type: ClusterIP
  ports:
    - port: 9090
      targetPort: 9090
      protocol: TCP
      name: web
  selector:
    app: prometheus