# infrastructure/monitoring/prometheus/rules/alert-system-rules.yml
# Prometheus alerting rules for the Bakery Alert and Recommendation System

groups:
  - name: alert_system_health
    rules:
      # System component health alerts
      - alert: AlertSystemComponentDown
        expr: alert_system_component_health == 0
        for: 2m
        labels:
          severity: critical
          service: "{{ $labels.service }}"
          component: "{{ $labels.component }}"
        annotations:
          summary: "Alert system component {{ $labels.component }} is unhealthy"
          description: "Component {{ $labels.component }} in service {{ $labels.service }} has been unhealthy for more than 2 minutes."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#component-health"

      # Connection health alerts
      - alert: RabbitMQConnectionDown
        expr: alert_rabbitmq_connection_status == 0
        for: 1m
        labels:
          severity: critical
          service: "{{ $labels.service }}"
        annotations:
          summary: "RabbitMQ connection down for {{ $labels.service }}"
          description: "Service {{ $labels.service }} has lost connection to RabbitMQ for more than 1 minute."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#rabbitmq-connection"

      - alert: RedisConnectionDown
        expr: alert_redis_connection_status == 0
        for: 1m
        labels:
          severity: critical
          service: "{{ $labels.service }}"
        annotations:
          summary: "Redis connection down for {{ $labels.service }}"
          description: "Service {{ $labels.service }} has lost connection to Redis for more than 1 minute."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#redis-connection"

      # Leader election issues
      - alert: NoSchedulerLeader
        expr: sum(alert_scheduler_leader_status) == 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "No scheduler leader elected"
          description: "No service has been elected as scheduler leader for more than 5 minutes. Scheduled checks may not be running."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#leader-election"

  - name: alert_system_performance
    rules:
      # High error rates
      - alert: HighAlertProcessingErrorRate
        expr: rate(alert_processing_errors_total[5m]) > 0.1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High alert processing error rate"
          description: "Alert processing error rate is {{ $value | humanizePercentage }} over the last 5 minutes."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#processing-errors"

      - alert: HighNotificationDeliveryFailureRate
        expr: rate(alert_delivery_failures_total[5m]) / rate(alert_notifications_sent_total[5m]) > 0.05
        for: 3m
        labels:
          severity: warning
          channel: "{{ $labels.channel }}"
        annotations:
          summary: "High notification delivery failure rate for {{ $labels.channel }}"
          description: "Notification delivery failure rate for {{ $labels.channel }} is {{ $value | humanizePercentage }} over the last 5 minutes."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#delivery-failures"

      # Processing latency
      - alert: HighAlertProcessingLatency
        expr: histogram_quantile(0.95, rate(alert_processing_duration_seconds_bucket[5m])) > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High alert processing latency"
          description: "95th percentile alert processing latency is {{ $value }}s, exceeding 5s threshold."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#processing-latency"

      # SSE connection issues
      - alert: TooManySSEConnections
        expr: sum(alert_sse_active_connections) > 1000
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Too many active SSE connections"
          description: "Number of active SSE connections ({{ $value }}) exceeds 1000. This may impact performance."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#sse-connections"

      - alert: SSEConnectionErrors
        expr: rate(alert_sse_connection_errors_total[5m]) > 0.5
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "High SSE connection error rate"
          description: "SSE connection error rate is {{ $value }} errors/second over the last 5 minutes."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#sse-errors"

  - name: alert_system_business
    rules:
      # Alert volume anomalies
      - alert: UnusuallyHighAlertVolume
        expr: rate(alert_items_published_total{item_type="alert"}[10m]) > 2
        for: 5m
        labels:
          severity: warning
          service: "{{ $labels.service }}"
        annotations:
          summary: "Unusually high alert volume from {{ $labels.service }}"
          description: "Service {{ $labels.service }} is generating alerts at {{ $value }} alerts/second, which is above normal levels."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#high-volume"

      - alert: NoAlertsGenerated
        expr: rate(alert_items_published_total[30m]) == 0
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "No alerts generated recently"
          description: "No alerts have been generated in the last 30 minutes. This may indicate a problem with detection systems."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#no-alerts"

      # Response time issues
      - alert: SlowAlertResponseTime
        expr: histogram_quantile(0.95, rate(alert_item_response_time_seconds_bucket[1h])) > 3600
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Slow alert response times"
          description: "95th percentile alert response time is {{ $value | humanizeDuration }}, exceeding 1 hour."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#response-times"

      # Critical alerts not acknowledged
      - alert: CriticalAlertsUnacknowledged
        expr: sum(alert_active_items_current{item_type="alert",severity="urgent"}) > 5
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "Multiple critical alerts unacknowledged"
          description: "{{ $value }} critical alerts remain unacknowledged for more than 10 minutes."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#critical-unacked"

  - name: alert_system_capacity
    rules:
      # Queue size monitoring
      - alert: LargeSSEMessageQueues
        expr: alert_sse_message_queue_size > 100
        for: 5m
        labels:
          severity: warning
          tenant_id: "{{ $labels.tenant_id }}"
        annotations:
          summary: "Large SSE message queue for tenant {{ $labels.tenant_id }}"
          description: "SSE message queue for tenant {{ $labels.tenant_id }} has {{ $value }} messages, indicating potential client issues."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#sse-queues"

      # Database storage issues
      - alert: SlowDatabaseStorage
        expr: histogram_quantile(0.95, rate(alert_database_storage_duration_seconds_bucket[5m])) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Slow database storage for alerts"
          description: "95th percentile database storage time is {{ $value }}s, exceeding 1s threshold."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#database-storage"

  - name: alert_system_effectiveness
    rules:
      # False positive rate monitoring
      - alert: HighFalsePositiveRate
        expr: alert_false_positive_rate > 0.2
        for: 30m
        labels:
          severity: warning
          service: "{{ $labels.service }}"
          alert_type: "{{ $labels.alert_type }}"
        annotations:
          summary: "High false positive rate for {{ $labels.alert_type }}"
          description: "False positive rate for {{ $labels.alert_type }} in {{ $labels.service }} is {{ $value | humanizePercentage }}."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#false-positives"

      # Low recommendation adoption
      - alert: LowRecommendationAdoption
        expr: rate(alert_recommendations_implemented_total[24h]) / rate(alert_items_published_total{item_type="recommendation"}[24h]) < 0.1
        for: 1h
        labels:
          severity: info
          service: "{{ $labels.service }}"
        annotations:
          summary: "Low recommendation adoption rate"
          description: "Recommendation adoption rate for {{ $labels.service }} is {{ $value | humanizePercentage }} over the last 24 hours."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#recommendation-adoption"

# Additional alerting rules for specific scenarios
  - name: alert_system_critical_scenarios
    rules:
      # Complete system failure
      - alert: AlertSystemDown
        expr: up{job=~"alert-processor|notification-service"} == 0
        for: 1m
        labels:
          severity: critical
          service: "{{ $labels.job }}"
        annotations:
          summary: "Alert system service {{ $labels.job }} is down"
          description: "Critical alert system service {{ $labels.job }} has been down for more than 1 minute."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#service-down"

      # Data loss prevention
      - alert: AlertDataNotPersisted
        expr: rate(alert_items_processed_total[5m]) > 0 and rate(alert_database_storage_duration_seconds_count[5m]) == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Alert data not being persisted to database"
          description: "Alerts are being processed but not stored in database, potential data loss."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#data-persistence"

      # Notification blackhole
      - alert: NotificationsNotDelivered
        expr: rate(alert_items_processed_total[5m]) > 0 and rate(alert_notifications_sent_total[5m]) == 0
        for: 3m
        labels:
          severity: critical
        annotations:
          summary: "Notifications not being delivered"
          description: "Alerts are being processed but no notifications are being sent."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#notification-delivery"