Improve docker config

2025-07-20 02:16:51 +02:00
parent 9a67f3d175
commit 1c730c3c81
27 changed files with 2598 additions and 1161 deletions
--- a/infrastructure/monitoring/grafana/dashboards/dashboard.yml
+++ b/infrastructure/monitoring/grafana/dashboards/dashboard.yml
@@ -1,11 +1,15 @@
+# infrastructure/monitoring/grafana/dashboards/dashboard.yml
+# Grafana dashboard provisioning: a single file-based provider that loads dashboards from disk
+
 apiVersion: 1

 providers:
-  - name: 'Bakery Forecasting'
+  - name: 'bakery-dashboards'  # provider name; the former display-style name now lives in `folder`
    orgId: 1
-    folder: ''
+    folder: 'Bakery Forecasting'  # Grafana folder the provisioned dashboards are placed in
    type: file
    disableDeletion: false
    updateIntervalSeconds: 10
+    allowUiUpdates: true  # allow provisioned dashboards to be edited from the Grafana UI
    options:
      path: /etc/grafana/provisioning/dashboards
--- a/infrastructure/monitoring/grafana/datasources/prometheus.yml
+++ b/infrastructure/monitoring/grafana/datasources/prometheus.yml
@@ -1,3 +1,6 @@
+# infrastructure/monitoring/grafana/datasources/prometheus.yml
+# Grafana datasource provisioning: Prometheus (default) plus a Jaeger datasource for trace links
+
 apiVersion: 1

 datasources:
@@ -6,4 +9,20 @@ datasources:
    access: proxy
    url: http://prometheus:9090
    isDefault: true
+    version: 1
+    editable: true  # datasource may be edited from the Grafana UI
+    jsonData:
+      timeInterval: "15s"  # equals Prometheus' global scrape_interval; NOTE(review): the jobs shown in prometheus.yml scrape every 30s - confirm
+      queryTimeout: "60s"
+      httpMethod: "POST"  # send queries as POST requests
+      exemplarTraceIdDestinations:
+        - name: trace_id  # exemplar label holding the trace id
+          datasourceUid: jaeger  # must match the `uid` of the Jaeger datasource declared below
+
+  - name: Jaeger
+    type: jaeger
+    access: proxy
+    url: http://jaeger:16686
+    uid: jaeger  # fixed uid so the Prometheus exemplar link above can reference this datasource
+    version: 1
    editable: true
--- a/infrastructure/monitoring/prometheus/prometheus.yml
+++ b/infrastructure/monitoring/prometheus/prometheus.yml
@@ -1,17 +1,30 @@
---
+# infrastructure/monitoring/prometheus/prometheus.yml
+# Prometheus configuration: global settings, rule loading, alert routing and scrape jobs
+
 global:
  scrape_interval: 15s
  evaluation_interval: 15s
+  external_labels:  # labels added to series/alerts when talking to external systems (federation, remote storage, Alertmanager)
+    cluster: 'bakery-forecasting'
+    replica: 'prometheus-01'

 rule_files:
-  - "alerts.yml"
+  - "/etc/prometheus/rules/*.yml"  # glob: every .yml file in the rules directory is loaded
+
+alerting:
+  alertmanagers:  # no active target below: rules are evaluated, but firing alerts are not forwarded until one is enabled
+    - static_configs:
+        - targets:
+          # - alertmanager:9093

 scrape_configs:
+  # Application microservices (statically configured targets)
  - job_name: 'gateway'
    static_configs:
      - targets: ['gateway:8000']
    metrics_path: '/metrics'
    scrape_interval: 30s
+    scrape_timeout: 10s  # per-scrape deadline; kept below this job's 30s scrape_interval

  - job_name: 'auth-service'
    static_configs:
@@ -49,11 +62,21 @@ scrape_configs:
    metrics_path: '/metrics'
    scrape_interval: 30s

+  # Infrastructure monitoring. Redis speaks RESP (not HTTP) on 6379, so scrape redis_exporter (default port 9121) instead
  - job_name: 'redis'
    static_configs:
-      - targets: ['redis:6379']
+      - targets: ['redis-exporter:9121']
+    metrics_path: '/metrics'
+    scrape_interval: 30s

  - job_name: 'rabbitmq'
    static_configs:
      - targets: ['rabbitmq:15692']
+    metrics_path: '/metrics'  # endpoint of the rabbitmq_prometheus plugin, which must be enabled on the broker
+    scrape_interval: 30s

+  # Database monitoring (requires postgres_exporter)
+  - job_name: 'postgres'
+    static_configs:
+      - targets: ['postgres-exporter:9187']
+    scrape_interval: 30s
--- a/infrastructure/monitoring/prometheus/rules/alerts.yml
+++ b/infrastructure/monitoring/prometheus/rules/alerts.yml
@@ -0,0 +1,86 @@
+# infrastructure/monitoring/prometheus/rules/alerts.yml
+# Prometheus alerting rules (expressions are aggregated to the level their annotations describe)
+
+groups:
+  - name: bakery_services
+    rules:
+      # Service availability: a scrape target has reported up == 0 for 2 minutes
+      - alert: ServiceDown
+        expr: up == 0
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Service {{ $labels.job }} is down"
+          description: "Service {{ $labels.job }} has been down for more than 2 minutes."
+
+      # High error rate: 5xx responses summed per job, so errors spread over endpoints/status codes count together
+      - alert: HighErrorRate
+        expr: sum by (job) (rate(http_requests_total{status=~"5.."}[5m])) > 0.1
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High error rate on {{ $labels.job }}"
+          description: "Error rate is {{ $value }} errors per second on {{ $labels.job }}."
+
+      # High response time: p95 per job; buckets are summed by (job, le) before histogram_quantile
+      - alert: HighResponseTime
+        expr: histogram_quantile(0.95, sum by (job, le) (rate(http_request_duration_seconds_bucket[5m]))) > 1
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High response time on {{ $labels.job }}"
+          description: "95th percentile response time is {{ $value }}s on {{ $labels.job }}."
+
+      # Memory usage: resident memory of a scraped process above 500 (bytes / 1024 / 1024)
+      - alert: HighMemoryUsage
+        expr: process_resident_memory_bytes / 1024 / 1024 > 500
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High memory usage on {{ $labels.job }}"
+          description: "Memory usage is {{ $value }}MB on {{ $labels.job }}."
+
+      # Database connections: pg_stat_activity_count is one series per database/state, so sum it per exporter
+      - alert: DatabaseConnectionHigh
+        expr: sum by (job, instance) (pg_stat_activity_count) > 80
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High database connections"
+          description: "Database has {{ $value }} active connections."
+
+  - name: bakery_business
+    rules:
+      # Training jobs: fires immediately (no "for:") once the failure counter has grown within the last hour
+      - alert: TrainingJobFailed
+        expr: increase(training_jobs_failed_total[1h]) > 0
+        labels:
+          severity: warning
+        annotations:
+          summary: "Training job failed"
+          description: "{{ $value }} training jobs have failed in the last hour."
+
+      # Prediction accuracy: below 0.7 for 15 minutes (presumably a 0-1 ratio reported per tenant_id - confirm against the exporter)
+      - alert: LowPredictionAccuracy
+        expr: prediction_accuracy < 0.7
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Low prediction accuracy"
+          description: "Prediction accuracy is {{ $value }} for tenant {{ $labels.tenant_id }}."
+
+      # API rate limiting: more than 10 rate-limit hits per 5-minute window, sustained for 5 minutes
+      - alert: APIRateLimitHit
+        expr: increase(rate_limit_hits_total[5m]) > 10
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "API rate limit hit frequently"
+          description: "Rate limit has been hit {{ $value }} times in 5 minutes."