# ================================================================
# Monitoring Configuration: infrastructure/monitoring/prometheus/forecasting-service.yml
# ================================================================
# Prometheus alerting rules for the forecasting service.
# Each rule fires only after its `expr` has held continuously for the
# duration given in `for`.

groups:
  - name: forecasting-service
    rules:
      # Fires when the scrape target for job "forecasting-service" reports
      # up == 0 for one minute.
      - alert: ForecastingServiceDown
        expr: up{job="forecasting-service"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Forecasting service is down"
          description: "Forecasting service has been down for more than 1 minute"

      # Fires when the 95th-percentile processing time exceeds 10 seconds.
      # histogram_quantile() must be fed per-bucket rates that keep the `le`
      # label, so the quantile is computed from the `_bucket` series over a
      # 5-minute window instead of from the bare metric name.
      # NOTE(review): assumes forecast_processing_time_seconds is exported as
      # a classic histogram (with `_bucket` series) - confirm against the
      # service's /metrics output.
      - alert: HighForecastingLatency
        expr: |-
          histogram_quantile(
            0.95,
            sum by (le) (rate(forecast_processing_time_seconds_bucket[5m]))
          ) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High forecasting latency"
          description: "95th percentile forecasting latency is {{ $value }}s"

      # Fires when the per-second rate of forecasting_errors_total, averaged
      # over 5 minutes, stays above 0.1 for any single series.
      - alert: ForecastingErrorRate
        expr: rate(forecasting_errors_total[5m]) > 0.1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High forecasting error rate"
          description: "Forecasting error rate is {{ $value }} errors/sec"

      # Fires when the average of model_accuracy_score across all series
      # stays below 0.7 for ten minutes.
      - alert: LowModelAccuracy
        expr: avg(model_accuracy_score) < 0.7
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Low model accuracy detected"
          description: "Average model accuracy is {{ $value }}"