Improve docker config

This commit is contained in:
Urtzi Alfaro
2025-07-20 02:16:51 +02:00
parent 9a67f3d175
commit 1c730c3c81
27 changed files with 2598 additions and 1161 deletions

View File

@@ -1,11 +1,15 @@
# infrastructure/monitoring/grafana/dashboards/dashboard.yml
# Grafana dashboard provisioning
# Grafana dashboard provisioning: declares a file-based dashboard provider.
apiVersion: 1
providers:
# NOTE(review): this text is a rendered diff (hunk "-1,11 +1,15") whose +/-
# markers were lost. The two `name` lines below are presumably the old and the
# new value of a single provider entry, not two providers - confirm against
# the committed file.
- name: 'Bakery Forecasting'
- name: 'bakery-dashboards'
orgId: 1
# NOTE(review): likewise, the two `folder` lines are presumably old value then
# new value; if both were really present this would be a duplicate key.
folder: ''
folder: 'Bakery Forecasting'
# Dashboards are loaded from files on disk (see options.path).
type: file
disableDeletion: false
# Interval, in seconds, at which the provider rescans for changed dashboards
# (per the key name) - TODO confirm against the Grafana provisioning docs.
updateIntervalSeconds: 10
allowUiUpdates: true
options:
# Directory the file provider reads dashboard definitions from.
path: /etc/grafana/provisioning/dashboards

View File

@@ -1,3 +1,6 @@
# infrastructure/monitoring/grafana/datasources/prometheus.yml
# Grafana Prometheus datasource configuration
apiVersion: 1
datasources:
@@ -6,4 +9,20 @@ datasources:
access: proxy
url: http://prometheus:9090
isDefault: true
version: 1
editable: true
jsonData:
timeInterval: "15s"
queryTimeout: "60s"
httpMethod: "POST"
exemplarTraceIdDestinations:
- name: trace_id
datasourceUid: jaeger
# Jaeger tracing datasource, reached through the Grafana backend (proxy).
# Its uid is the value referenced by `datasourceUid: jaeger` in the
# exemplarTraceIdDestinations of the datasource entry above.
- name: 'Jaeger'
  uid: 'jaeger'
  type: 'jaeger'
  access: 'proxy'
  url: 'http://jaeger:16686'
  editable: true
  version: 1

View File

@@ -1,17 +1,30 @@
---
# infrastructure/monitoring/prometheus/prometheus.yml
# Prometheus configuration
# Server-wide defaults: scrape targets and evaluate rules every 15 seconds
# unless a job sets its own interval.
global:
  evaluation_interval: 15s
  scrape_interval: 15s
  # Identifying labels for this Prometheus server.
  external_labels:
    replica: "prometheus-01"
    cluster: "bakery-forecasting"
# Rule files loaded by Prometheus (alerting rules live in rules/alerts.yml).
rule_files:
# NOTE(review): this text is a rendered diff with the +/- markers lost, so
# "alerts.yml" may be the pre-change entry that the glob below replaced -
# confirm which entries exist in the committed file.
- "alerts.yml"
# Loads every .yml file found in the rules directory.
- "/etc/prometheus/rules/*.yml"
# Alertmanager wiring. No Alertmanager target is configured, so alerting
# rules are still evaluated but firing alerts are not delivered anywhere.
alerting:
  alertmanagers:
  - static_configs:
    # Explicit empty list instead of a bare `targets:` key (which parses as
    # null). To enable delivery, replace [] with ['alertmanager:9093'].
    - targets: []
scrape_configs:
# Microservice scrape jobs (statically configured targets)
# API gateway: scraped every 30s instead of the 15s global default, with each
# scrape given at most 10s to complete.
- job_name: "gateway"
  metrics_path: "/metrics"
  scrape_interval: 30s
  scrape_timeout: 10s
  static_configs:
  - targets:
    - "gateway:8000"
- job_name: 'auth-service'
static_configs:
@@ -49,11 +62,21 @@ scrape_configs:
metrics_path: '/metrics'
scrape_interval: 30s
# Infrastructure monitoring
# Redis metrics (requires redis_exporter).
# Redis itself speaks the Redis protocol on 6379 and serves no HTTP /metrics
# endpoint, so scraping redis:6379 directly always fails and leaves `up == 0`
# for this job. Scrape a redis_exporter instead; 9121 is its default port.
# NOTE(review): the `redis-exporter` hostname is an assumption - match it to
# the exporter's service name in the compose file.
- job_name: 'redis'
  static_configs:
  - targets: ['redis-exporter:9121']
  metrics_path: '/metrics'
  scrape_interval: 30s
# RabbitMQ broker metrics, pulled from port 15692 every 30 seconds.
- job_name: "rabbitmq"
  scrape_interval: 30s
  metrics_path: "/metrics"
  static_configs:
  - targets:
    - "rabbitmq:15692"
# PostgreSQL metrics, read from a postgres_exporter container (the database
# itself is not scraped directly), every 30 seconds.
- job_name: "postgres"
  scrape_interval: 30s
  static_configs:
  - targets:
    - "postgres-exporter:9187"

View File

@@ -0,0 +1,86 @@
# infrastructure/monitoring/prometheus/rules/alerts.yml
# Prometheus alerting rules
groups:
# Infrastructure-level alerts that apply to every scraped service.
- name: bakery_services
  rules:
  # Service availability alerts
  # `up` is 0 for a target whose most recent scrape failed.
  - alert: ServiceDown
    expr: up == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Service {{ $labels.job }} is down"
      description: "Service {{ $labels.job }} has been down for more than 2 minutes."
  # High error rate alerts
  # Absolute rate of 5xx responses (errors per second), not an error ratio.
  - alert: HighErrorRate
    expr: rate(http_requests_total{status=~"5.."}[5m]) > 0.1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High error rate on {{ $labels.job }}"
      description: "Error rate is {{ $value }} errors per second on {{ $labels.job }}."
  # High response time alerts
  # 95th percentile latency over the last 5 minutes, threshold 1 second.
  - alert: HighResponseTime
    expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High response time on {{ $labels.job }}"
      description: "95th percentile response time is {{ $value }}s on {{ $labels.job }}."
  # Memory usage alerts
  # Resident memory converted from bytes to MB; threshold 500 MB.
  - alert: HighMemoryUsage
    expr: process_resident_memory_bytes / 1024 / 1024 > 500
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High memory usage on {{ $labels.job }}"
      description: "Memory usage is {{ $value }}MB on {{ $labels.job }}."
  # Database connection alerts
  # postgres_exporter reports pg_stat_activity_count as one series per
  # database and connection state, so comparing each series to 80 would miss
  # a server whose total exceeds 80. Sum per server before comparing.
  # NOTE(review): the sum covers every connection state; add
  # {state="active"} if only active connections should count.
  - alert: DatabaseConnectionHigh
    expr: sum by (job, instance) (pg_stat_activity_count) > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High database connections"
      description: "Database has {{ $value }} active connections."
# Business-level alerts driven by application metrics.
- name: bakery_business
  rules:
  # Training jobs: no `for` clause, so this fires as soon as the failure
  # counter shows any increase over the last hour.
  - alert: TrainingJobFailed
    annotations:
      description: '{{ $value }} training jobs have failed in the last hour.'
      summary: 'Training job failed'
    labels:
      severity: 'warning'
    expr: increase(training_jobs_failed_total[1h]) > 0
  # Forecast quality: accuracy must stay below 0.7 for 15 minutes to fire.
  - alert: LowPredictionAccuracy
    annotations:
      description: 'Prediction accuracy is {{ $value }} for tenant {{ $labels.tenant_id }}.'
      summary: 'Low prediction accuracy'
    labels:
      severity: 'warning'
    for: 15m
    expr: prediction_accuracy < 0.7
  # Rate limiting: more than 10 limiter hits per 5-minute window, sustained
  # for 5 minutes.
  - alert: APIRateLimitHit
    annotations:
      description: 'Rate limit has been hit {{ $value }} times in 5 minutes.'
      summary: 'API rate limit hit frequently'
    labels:
      severity: 'warning'
    for: 5m
    expr: increase(rate_limit_hits_total[5m]) > 10