Improve docker config
This commit is contained in:
@@ -1,11 +1,15 @@
|
||||
# infrastructure/monitoring/grafana/dashboards/dashboard.yml
|
||||
# Grafana dashboard provisioning
|
||||
|
||||
apiVersion: 1
|
||||
|
||||
providers:
|
||||
- name: 'Bakery Forecasting'
|
||||
- name: 'bakery-dashboards'
|
||||
orgId: 1
|
||||
folder: ''
|
||||
folder: 'Bakery Forecasting'
|
||||
type: file
|
||||
disableDeletion: false
|
||||
updateIntervalSeconds: 10
|
||||
allowUiUpdates: true
|
||||
options:
|
||||
path: /etc/grafana/provisioning/dashboards
|
||||
@@ -1,3 +1,6 @@
|
||||
# infrastructure/monitoring/grafana/datasources/prometheus.yml
|
||||
# Grafana Prometheus datasource configuration
|
||||
|
||||
apiVersion: 1
|
||||
|
||||
datasources:
|
||||
@@ -6,4 +9,20 @@ datasources:
|
||||
access: proxy
|
||||
url: http://prometheus:9090
|
||||
isDefault: true
|
||||
version: 1
|
||||
editable: true
|
||||
jsonData:
|
||||
timeInterval: "15s"
|
||||
queryTimeout: "60s"
|
||||
httpMethod: "POST"
|
||||
exemplarTraceIdDestinations:
|
||||
- name: trace_id
|
||||
datasourceUid: jaeger
|
||||
|
||||
- name: Jaeger
|
||||
type: jaeger
|
||||
access: proxy
|
||||
url: http://jaeger:16686
|
||||
uid: jaeger
|
||||
version: 1
|
||||
editable: true
|
||||
@@ -1,17 +1,30 @@
|
||||
---
|
||||
# infrastructure/monitoring/prometheus/prometheus.yml
|
||||
# Prometheus configuration
|
||||
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
evaluation_interval: 15s
|
||||
external_labels:
|
||||
cluster: 'bakery-forecasting'
|
||||
replica: 'prometheus-01'
|
||||
|
||||
rule_files:
|
||||
- "alerts.yml"
|
||||
- "/etc/prometheus/rules/*.yml"
|
||||
|
||||
alerting:
|
||||
alertmanagers:
|
||||
- static_configs:
|
||||
- targets:
|
||||
# - alertmanager:9093
|
||||
|
||||
scrape_configs:
|
||||
# Microservice scrape jobs (statically configured targets)
|
||||
- job_name: 'gateway'
|
||||
static_configs:
|
||||
- targets: ['gateway:8000']
|
||||
metrics_path: '/metrics'
|
||||
scrape_interval: 30s
|
||||
scrape_timeout: 10s
|
||||
|
||||
- job_name: 'auth-service'
|
||||
static_configs:
|
||||
@@ -49,11 +62,21 @@ scrape_configs:
|
||||
metrics_path: '/metrics'
|
||||
scrape_interval: 30s
|
||||
|
||||
# Infrastructure monitoring
|
||||
- job_name: 'redis'
  # Port 6379 is the Redis protocol port and serves no HTTP /metrics
  # endpoint, so scraping it directly always fails and keeps `up` at 0.
  # Scrape a redis_exporter sidecar instead (its default port is 9121).
  # NOTE(review): assumes a `redis-exporter` service exists on the same
  # Docker network — confirm against the compose file.
  static_configs:
    - targets: ['redis-exporter:9121']
  metrics_path: '/metrics'
  scrape_interval: 30s
|
||||
|
||||
- job_name: 'rabbitmq'
  # Pull /metrics from the broker on port 15692 every 30 seconds.
  # NOTE(review): 15692 is presumably the rabbitmq_prometheus plugin
  # endpoint — confirm the plugin is enabled in the broker image.
  scrape_interval: 30s
  metrics_path: '/metrics'
  static_configs:
    - targets:
        - 'rabbitmq:15692'
|
||||
|
||||
# Database monitoring (requires postgres_exporter)
|
||||
- job_name: 'postgres'
|
||||
static_configs:
|
||||
- targets: ['postgres-exporter:9187']
|
||||
scrape_interval: 30s
|
||||
86
infrastructure/monitoring/prometheus/rules/alerts.yml
Normal file
86
infrastructure/monitoring/prometheus/rules/alerts.yml
Normal file
@@ -0,0 +1,86 @@
|
||||
# infrastructure/monitoring/prometheus/rules/alerts.yml
|
||||
# Prometheus alerting rules
|
||||
|
||||
groups:
|
||||
- name: bakery_services
|
||||
rules:
|
||||
# Service availability alerts
|
||||
- alert: ServiceDown
  # Fires when a scrape target has reported `up == 0` for two minutes
  # in a row.
  expr: up == 0
  for: 2m
  annotations:
    summary: 'Service {{ $labels.job }} is down'
    description: >-
      Service {{ $labels.job }} has been down for more than 2 minutes.
  labels:
    severity: critical
|
||||
|
||||
# High error rate alerts
|
||||
- alert: HighErrorRate
  # Sum the 5xx request rate per job before comparing to the threshold.
  # Without the aggregation each status-code/instance series is checked
  # on its own, so the value would not be the per-job error rate that
  # the annotations below report.
  expr: sum by (job) (rate(http_requests_total{status=~"5.."}[5m])) > 0.1
  # The rate must stay above 0.1 errors/second for five minutes.
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "High error rate on {{ $labels.job }}"
    description: "Error rate is {{ $value }} errors per second on {{ $labels.job }}."
|
||||
|
||||
# High response time alerts
|
||||
- alert: HighResponseTime
  # Aggregate the bucket rates by `le` and `job` before taking the
  # quantile, so a single p95 is computed per job (matching the
  # annotations) instead of one per label combination of the histogram.
  # `le` must be preserved for histogram_quantile to work.
  expr: >-
    histogram_quantile(0.95,
    sum by (le, job) (rate(http_request_duration_seconds_bucket[5m]))) > 1
  # The p95 must stay above 1 second for five minutes.
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "High response time on {{ $labels.job }}"
    description: "95th percentile response time is {{ $value }}s on {{ $labels.job }}."
|
||||
|
||||
# Memory usage alerts
|
||||
- alert: HighMemoryUsage
  # Resident memory in bytes, divided by 1024 twice, must stay above 500
  # for five minutes.
  expr: process_resident_memory_bytes / 1024 / 1024 > 500
  for: 5m
  annotations:
    summary: 'High memory usage on {{ $labels.job }}'
    description: 'Memory usage is {{ $value }}MB on {{ $labels.job }}.'
  labels:
    severity: warning
|
||||
|
||||
# Database connection alerts
|
||||
- alert: DatabaseConnectionHigh
  # Fires when a pg_stat_activity_count series stays above 80 for five
  # minutes.
  # NOTE(review): presumably this metric comes from postgres_exporter and
  # is split by labels such as database and state; if so the threshold
  # applies to each series, not to the total — confirm.
  expr: pg_stat_activity_count > 80
  for: 5m
  annotations:
    summary: 'High database connections'
    description: 'Database has {{ $value }} active connections.'
  labels:
    severity: warning
|
||||
|
||||
- name: bakery_business
|
||||
rules:
|
||||
# Training job alerts
|
||||
- alert: TrainingJobFailed
  # There is no `for:` clause, so the alert fires on the first rule
  # evaluation in which the failure counter grew over the trailing hour.
  expr: increase(training_jobs_failed_total[1h]) > 0
  annotations:
    summary: 'Training job failed'
    description: >-
      {{ $value }} training jobs have failed in the last hour.
  labels:
    severity: warning
|
||||
|
||||
# Prediction accuracy alerts
|
||||
- alert: LowPredictionAccuracy
  # Fires when prediction_accuracy stays below 0.7 for fifteen minutes.
  # The description reads the `tenant_id` label from the firing series.
  expr: prediction_accuracy < 0.7
  for: 15m
  annotations:
    summary: 'Low prediction accuracy'
    description: >-
      Prediction accuracy is {{ $value }} for tenant {{ $labels.tenant_id }}.
  labels:
    severity: warning
|
||||
|
||||
# API rate limit alerts
|
||||
- alert: APIRateLimitHit
  # Fires when the rate-limit counter has grown by more than 10 within a
  # 5-minute window, and that condition has held for five minutes.
  expr: increase(rate_limit_hits_total[5m]) > 10
  for: 5m
  annotations:
    summary: 'API rate limit hit frequently'
    description: >-
      Rate limit has been hit {{ $value }} times in 5 minutes.
  labels:
    severity: warning
|
||||
Reference in New Issue
Block a user