---
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-alert-rules
  namespace: monitoring
data:
  alert-rules.yml: |
    groups:
      # Basic Infrastructure Alerts
      - name: bakery_services
        interval: 30s
        rules:
          - alert: ServiceDown
            expr: up{job="bakery-services"} == 0
            for: 2m
            labels:
              severity: critical
              component: infrastructure
            annotations:
              summary: "Service {{ $labels.service }} is down"
              description: "Service {{ $labels.service }} in namespace {{ $labels.namespace }} has been down for more than 2 minutes."
              runbook_url: "https://runbooks.bakery-ia.local/ServiceDown"

          - alert: HighErrorRate
            expr: |
              (
                sum(rate(http_requests_total{status_code=~"5..", job="bakery-services"}[5m])) by (service)
                /
                sum(rate(http_requests_total{job="bakery-services"}[5m])) by (service)
              ) > 0.10
            for: 5m
            labels:
              severity: critical
              component: application
            annotations:
              summary: "High error rate on {{ $labels.service }}"
              description: "Service {{ $labels.service }} has error rate above 10% (current: {{ $value | humanizePercentage }})."
              runbook_url: "https://runbooks.bakery-ia.local/HighErrorRate"

          - alert: HighResponseTime
            expr: |
              histogram_quantile(0.95,
                sum(rate(http_request_duration_seconds_bucket{job="bakery-services"}[5m])) by (service, le)
              ) > 1
            for: 5m
            labels:
              severity: warning
              component: performance
            annotations:
              summary: "High response time on {{ $labels.service }}"
              description: "Service {{ $labels.service }} P95 latency is above 1 second (current: {{ $value }}s)."
              runbook_url: "https://runbooks.bakery-ia.local/HighResponseTime"

          - alert: HighMemoryUsage
            expr: |
              container_memory_usage_bytes{namespace="bakery-ia", container!=""} > 500000000
            for: 5m
            labels:
              severity: warning
              component: infrastructure
            annotations:
              summary: "High memory usage in {{ $labels.pod }}"
              description: "Container {{ $labels.container }} in pod {{ $labels.pod }} is using more than 500MB of memory (current: {{ $value | humanize }}B)."
              runbook_url: "https://runbooks.bakery-ia.local/HighMemoryUsage"

          - alert: DatabaseConnectionHigh
            expr: |
              pg_stat_database_numbackends{datname="bakery"} > 80
            for: 5m
            labels:
              severity: warning
              component: database
            annotations:
              summary: "High database connection count"
              description: "Database has more than 80 active connections (current: {{ $value }})."
              runbook_url: "https://runbooks.bakery-ia.local/DatabaseConnectionHigh"

      # Business Logic Alerts
      - name: bakery_business
        interval: 30s
        rules:
          - alert: TrainingJobFailed
            expr: |
              increase(training_job_failures_total[1h]) > 0
            for: 5m
            labels:
              severity: warning
              component: ml-training
            annotations:
              summary: "Training job failures detected"
              description: "{{ $value }} training job(s) failed in the last hour."
              runbook_url: "https://runbooks.bakery-ia.local/TrainingJobFailed"

          - alert: LowPredictionAccuracy
            expr: |
              prediction_model_accuracy < 0.70
            for: 15m
            labels:
              severity: warning
              component: ml-inference
            annotations:
              summary: "Model prediction accuracy is low"
              description: "Model {{ $labels.model_name }} accuracy is below 70% (current: {{ $value | humanizePercentage }})."
              runbook_url: "https://runbooks.bakery-ia.local/LowPredictionAccuracy"

          - alert: APIRateLimitHit
            expr: |
              increase(rate_limit_hits_total[5m]) > 10
            for: 5m
            labels:
              severity: info
              component: api-gateway
            annotations:
              summary: "API rate limits being hit frequently"
              description: "Rate limits hit {{ $value }} times in the last 5 minutes."
              runbook_url: "https://runbooks.bakery-ia.local/APIRateLimitHit"

      # Alert System Health
      - name: alert_system_health
        interval: 30s
        rules:
          - alert: AlertSystemComponentDown
            expr: |
              alert_system_component_health{component=~"processor|notifier|scheduler"} == 0
            for: 2m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "Alert system component {{ $labels.component }} is unhealthy"
              description: "Component {{ $labels.component }} has been unhealthy for more than 2 minutes."
              runbook_url: "https://runbooks.bakery-ia.local/AlertSystemComponentDown"

          - alert: RabbitMQConnectionDown
            expr: |
              rabbitmq_up == 0
            for: 1m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "RabbitMQ connection is down"
              description: "Alert system has lost connection to RabbitMQ message queue."
              runbook_url: "https://runbooks.bakery-ia.local/RabbitMQConnectionDown"

          - alert: RedisConnectionDown
            expr: |
              redis_up == 0
            for: 1m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "Redis connection is down"
              description: "Alert system has lost connection to Redis cache."
              runbook_url: "https://runbooks.bakery-ia.local/RedisConnectionDown"

          - alert: NoSchedulerLeader
            expr: |
              sum(alert_system_scheduler_leader) == 0
            for: 5m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "No alert scheduler leader elected"
              description: "No scheduler instance has been elected as leader for 5 minutes."
              runbook_url: "https://runbooks.bakery-ia.local/NoSchedulerLeader"

      # Alert System Performance
      - name: alert_system_performance
        interval: 30s
        rules:
          - alert: HighAlertProcessingErrorRate
            expr: |
              (
                sum(rate(alert_processing_errors_total[2m]))
                /
                sum(rate(alerts_processed_total[2m]))
              ) > 0.10
            for: 2m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "High alert processing error rate"
              description: "Alert processing error rate is above 10% (current: {{ $value | humanizePercentage }})."
              runbook_url: "https://runbooks.bakery-ia.local/HighAlertProcessingErrorRate"

          - alert: HighNotificationDeliveryFailureRate
            expr: |
              (
                sum(rate(notification_delivery_failures_total[3m]))
                /
                sum(rate(notifications_sent_total[3m]))
              ) > 0.05
            for: 3m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "High notification delivery failure rate"
              description: "Notification delivery failure rate is above 5% (current: {{ $value | humanizePercentage }})."
              runbook_url: "https://runbooks.bakery-ia.local/HighNotificationDeliveryFailureRate"

          - alert: HighAlertProcessingLatency
            expr: |
              histogram_quantile(0.95,
                sum(rate(alert_processing_duration_seconds_bucket[5m])) by (le)
              ) > 5
            for: 5m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "High alert processing latency"
              description: "P95 alert processing latency is above 5 seconds (current: {{ $value }}s)."
              runbook_url: "https://runbooks.bakery-ia.local/HighAlertProcessingLatency"

          - alert: TooManySSEConnections
            expr: |
              sse_active_connections > 1000
            for: 2m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "Too many active SSE connections"
              description: "More than 1000 active SSE connections (current: {{ $value }})."
              runbook_url: "https://runbooks.bakery-ia.local/TooManySSEConnections"

          - alert: SSEConnectionErrors
            expr: |
              rate(sse_connection_errors_total[3m]) > 0.5
            for: 3m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "High rate of SSE connection errors"
              description: "SSE connection error rate is {{ $value }} errors/sec."
              runbook_url: "https://runbooks.bakery-ia.local/SSEConnectionErrors"

      # Alert System Business Logic
      - name: alert_system_business
        interval: 30s
        rules:
          - alert: UnusuallyHighAlertVolume
            expr: |
              rate(alerts_generated_total[5m]) > 2
            for: 5m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "Unusually high alert generation volume"
              description: "More than 2 alerts per second being generated (current: {{ $value }}/sec)."
              runbook_url: "https://runbooks.bakery-ia.local/UnusuallyHighAlertVolume"

          - alert: NoAlertsGenerated
            expr: |
              rate(alerts_generated_total[30m]) == 0
            for: 15m
            labels:
              severity: info
              component: alert-system
            annotations:
              summary: "No alerts generated recently"
              description: "No alerts have been generated in the last 30 minutes. This might indicate a problem with alert detection."
              runbook_url: "https://runbooks.bakery-ia.local/NoAlertsGenerated"

          - alert: SlowAlertResponseTime
            expr: |
              histogram_quantile(0.95,
                sum(rate(alert_response_time_seconds_bucket[10m])) by (le)
              ) > 3600
            for: 10m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "Slow alert response times"
              description: "P95 alert response time is above 1 hour (current: {{ $value | humanizeDuration }})."
              runbook_url: "https://runbooks.bakery-ia.local/SlowAlertResponseTime"

          - alert: CriticalAlertsUnacknowledged
            expr: |
              sum(alerts_unacknowledged{severity="critical"}) > 5
            for: 10m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "Multiple critical alerts unacknowledged"
              description: "{{ $value }} critical alerts have not been acknowledged for 10+ minutes."
              runbook_url: "https://runbooks.bakery-ia.local/CriticalAlertsUnacknowledged"

      # Alert System Capacity
      - name: alert_system_capacity
        interval: 30s
        rules:
          - alert: LargeSSEMessageQueues
            expr: |
              sse_message_queue_size > 100
            for: 5m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "Large SSE message queues detected"
              description: "SSE message queue for tenant {{ $labels.tenant_id }} has {{ $value }} messages queued."
              runbook_url: "https://runbooks.bakery-ia.local/LargeSSEMessageQueues"

          - alert: SlowDatabaseStorage
            expr: |
              histogram_quantile(0.95,
                sum(rate(alert_storage_duration_seconds_bucket[5m])) by (le)
              ) > 1
            for: 5m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "Slow alert database storage"
              description: "P95 alert storage latency is above 1 second (current: {{ $value }}s)."
              runbook_url: "https://runbooks.bakery-ia.local/SlowDatabaseStorage"

      # Alert System Critical Scenarios
      - name: alert_system_critical
        interval: 15s
        rules:
          - alert: AlertSystemDown
            expr: |
              up{service=~"alert-processor|notification-service"} == 0
            for: 1m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "Alert system is completely down"
              description: "Core alert system service {{ $labels.service }} is down."
              runbook_url: "https://runbooks.bakery-ia.local/AlertSystemDown"

          - alert: AlertDataNotPersisted
            expr: |
              (
                sum(rate(alerts_processed_total[2m]))
                -
                sum(rate(alerts_stored_total[2m]))
              ) > 0
            for: 2m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "Alerts not being persisted to database"
              description: "Alerts are being processed but not stored in the database."
              runbook_url: "https://runbooks.bakery-ia.local/AlertDataNotPersisted"

          - alert: NotificationsNotDelivered
            expr: |
              (
                sum(rate(alerts_processed_total[3m]))
                -
                sum(rate(notifications_sent_total[3m]))
              ) > 0
            for: 3m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "Notifications not being delivered"
              description: "Alerts are being processed but notifications are not being sent."
              runbook_url: "https://runbooks.bakery-ia.local/NotificationsNotDelivered"

      # Monitoring System Self-Monitoring
      - name: monitoring_health
        interval: 30s
        rules:
          - alert: PrometheusDown
            expr: up{job="prometheus"} == 0
            for: 5m
            labels:
              severity: critical
              component: monitoring
            annotations:
              summary: "Prometheus is down"
              description: "Prometheus monitoring system is not responding."
              runbook_url: "https://runbooks.bakery-ia.local/PrometheusDown"

          - alert: AlertManagerDown
            expr: up{job="alertmanager"} == 0
            for: 2m
            labels:
              severity: critical
              component: monitoring
            annotations:
              summary: "AlertManager is down"
              description: "AlertManager is not responding. Alerts will not be routed."
              runbook_url: "https://runbooks.bakery-ia.local/AlertManagerDown"

          - alert: PrometheusStorageFull
            expr: |
              (
                prometheus_tsdb_storage_blocks_bytes
                /
                (prometheus_tsdb_storage_blocks_bytes + prometheus_tsdb_wal_size_bytes)
              ) > 0.90
            for: 10m
            labels:
              severity: warning
              component: monitoring
            annotations:
              summary: "Prometheus storage almost full"
              description: "Prometheus storage is {{ $value | humanizePercentage }} full."
              runbook_url: "https://runbooks.bakery-ia.local/PrometheusStorageFull"

          - alert: PrometheusScrapeErrors
            expr: |
              rate(prometheus_target_scrapes_exceeded_sample_limit_total[5m]) > 0
            for: 5m
            labels:
              severity: warning
              component: monitoring
            annotations:
              summary: "Prometheus scrape errors detected"
              description: "Prometheus is experiencing scrape errors for target {{ $labels.job }}."
              runbook_url: "https://runbooks.bakery-ia.local/PrometheusScrapeErrors"