# bakery-ia/infrastructure/monitoring/prometheus/rules/alerts.yml

# infrastructure/monitoring/prometheus/rules/alerts.yml
# Prometheus alerting rules

groups:
  - name: bakery_services
    rules:
      # Availability: critical alert once `up` has been 0 for a full 2 minutes.
      - alert: ServiceDown
        expr: 'up == 0'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: 'Service {{ $labels.job }} is down'
          description: >-
            Service {{ $labels.job }} has been down for more than 2 minutes.

      # High error rate alerts
      # Sums the per-second rate of 5xx responses across all series of a job
      # before comparing against the 0.1/s threshold. Without the aggregation
      # the threshold is applied to every label combination on its own, so
      # errors spread over several series can each stay below 0.1/s and never
      # alert, while one job can raise several alerts whose annotations (which
      # only mention the job) are indistinguishable.
      - alert: HighErrorRate
        expr: sum by (job) (rate(http_requests_total{status=~"5.."}[5m])) > 0.1
        # The condition must hold for 5 minutes before the alert fires.
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate on {{ $labels.job }}"
          description: "Error rate is {{ $value }} errors per second on {{ $labels.job }}."

      # High response time alerts
      # Computes the 95th percentile request duration per job over a 5 minute
      # window and warns when it stays above 1 second for 5 minutes.
      # The bucket rates are summed by (job, le) first: `le` must be kept for
      # histogram_quantile to work, and aggregating to the job level yields
      # the single per-job quantile the annotations describe instead of one
      # quantile (and one alert) per remaining label combination.
      - alert: HighResponseTime
        expr: histogram_quantile(0.95, sum by (job, le) (rate(http_request_duration_seconds_bucket[5m]))) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High response time on {{ $labels.job }}"
          description: "95th percentile response time is {{ $value }}s on {{ $labels.job }}."

      # Memory: warn when resident memory, divided twice by 1024, stays above
      # 500 for 5 minutes.
      - alert: HighMemoryUsage
        expr: >-
          process_resident_memory_bytes / 1024 / 1024 > 500
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'High memory usage on {{ $labels.job }}'
          description: 'Memory usage is {{ $value }}MB on {{ $labels.job }}.'

      # Database connection alerts
      # Warns when the total number of connections reported by one exporter
      # target stays above 80 for 5 minutes.
      # NOTE(review): assumes pg_stat_activity_count is the postgres_exporter
      # gauge, which is split by database and connection state - confirm.
      # Comparing the raw metric would test each slice against 80 separately,
      # so the series are summed per scrape target before the comparison.
      - alert: DatabaseConnectionHigh
        expr: sum by (job, instance) (pg_stat_activity_count) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High database connections"
          description: "Database has {{ $value }} connections on {{ $labels.instance }}."

  - name: bakery_business
    rules:
      # Training job alerts
      # Warns whenever the failure counter has grown within the last hour.
      # There is no `for:` hold, so the alert fires on the first rule
      # evaluation that sees an increase.
      - alert: TrainingJobFailed
        expr: increase(training_jobs_failed_total[1h]) > 0
        labels:
          severity: warning
        annotations:
          summary: "Training job failed"
          # increase() extrapolates to the window edges, so $value is usually
          # fractional (e.g. 1.02); round it so the text reads as a job count.
          description: '{{ $value | printf "%.0f" }} training jobs have failed in the last hour.'

      # Model quality: warn when prediction_accuracy stays under 0.7 for
      # 15 minutes; the message reports the tenant_id label of the series.
      - alert: LowPredictionAccuracy
        expr: 'prediction_accuracy < 0.7'
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: 'Low prediction accuracy'
          description: >-
            Prediction accuracy is {{ $value }} for tenant {{ $labels.tenant_id }}.

      # API rate limit alerts
      # Warns when the rate-limit counter grows by more than 10 within a
      # 5 minute window and keeps doing so for 5 minutes.
      - alert: APIRateLimitHit
        expr: increase(rate_limit_hits_total[5m]) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "API rate limit hit frequently"
          # increase() extrapolates to the window edges, so $value is usually
          # fractional; round it so the text reads as a whole number of hits.
          description: 'Rate limit has been hit {{ $value | printf "%.0f" }} times in 5 minutes.'
Improve docker config 2025-07-20 02:16:51 +02:00			`# infrastructure/monitoring/prometheus/rules/alerts.yml`
			`# Prometheus alerting rules`

			`groups:`
			`- name: bakery_services`
			`rules:`
			`# Service availability alerts`
			`- alert: ServiceDown`
			`expr: up == 0`
			`for: 2m`
			`labels:`
			`severity: critical`
			`annotations:`
			`summary: "Service {{ $labels.job }} is down"`
			`description: "Service {{ $labels.job }} has been down for more than 2 minutes."`

			`# High error rate alerts`
			`- alert: HighErrorRate`
			`expr: rate(http_requests_total{status=~"5.."}[5m]) > 0.1`
			`for: 5m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "High error rate on {{ $labels.job }}"`
			`description: "Error rate is {{ $value }} errors per second on {{ $labels.job }}."`

			`# High response time alerts`
			`- alert: HighResponseTime`
			`expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1`
			`for: 5m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "High response time on {{ $labels.job }}"`
			`description: "95th percentile response time is {{ $value }}s on {{ $labels.job }}."`

			`# Memory usage alerts`
			`- alert: HighMemoryUsage`
			`expr: process_resident_memory_bytes / 1024 / 1024 > 500`
			`for: 5m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "High memory usage on {{ $labels.job }}"`
			`description: "Memory usage is {{ $value }}MB on {{ $labels.job }}."`

			`# Database connection alerts`
			`- alert: DatabaseConnectionHigh`
			`expr: pg_stat_activity_count > 80`
			`for: 5m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "High database connections"`
			`description: "Database has {{ $value }} active connections."`

			`- name: bakery_business`
			`rules:`
			`# Training job alerts`
			`- alert: TrainingJobFailed`
			`expr: increase(training_jobs_failed_total[1h]) > 0`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "Training job failed"`
			`description: "{{ $value }} training jobs have failed in the last hour."`

			`# Prediction accuracy alerts`
			`- alert: LowPredictionAccuracy`
			`expr: prediction_accuracy < 0.7`
			`for: 15m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "Low prediction accuracy"`
			`description: "Prediction accuracy is {{ $value }} for tenant {{ $labels.tenant_id }}."`

			`# API rate limit alerts`
			`- alert: APIRateLimitHit`
			`expr: increase(rate_limit_hits_total[5m]) > 10`
			`for: 5m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "API rate limit hit frequently"`
			`description: "Rate limit has been hit {{ $value }} times in 5 minutes."`