---
# ConfigMap consumed by Prometheus as a rule_files entry.
# Contains alerting rules for the bakery platform services, the internal
# alert system (processor / notifier / scheduler), and the monitoring
# stack's own self-monitoring.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-alert-rules
  namespace: monitoring
data:
  alert-rules.yml: |
    groups:
      # Basic Infrastructure Alerts
      - name: bakery_services
        interval: 30s
        rules:
          - alert: ServiceDown
            expr: up{job="bakery-services"} == 0
            for: 2m
            labels:
              severity: critical
              component: infrastructure
            annotations:
              summary: "Service {{ $labels.service }} is down"
              description: "Service {{ $labels.service }} in namespace {{ $labels.namespace }} has been down for more than 2 minutes."
              runbook_url: "https://runbooks.bakery-ia.local/ServiceDown"

          - alert: HighErrorRate
            # Ratio of 5xx responses to total requests, per service.
            expr: |
              (
                sum(rate(http_requests_total{status_code=~"5..", job="bakery-services"}[5m])) by (service)
                /
                sum(rate(http_requests_total{job="bakery-services"}[5m])) by (service)
              ) > 0.10
            for: 5m
            labels:
              severity: critical
              component: application
            annotations:
              summary: "High error rate on {{ $labels.service }}"
              description: "Service {{ $labels.service }} has error rate above 10% (current: {{ $value | humanizePercentage }})."
              runbook_url: "https://runbooks.bakery-ia.local/HighErrorRate"

          - alert: HighResponseTime
            # P95 request latency per service, from histogram buckets.
            expr: |
              histogram_quantile(0.95,
                sum(rate(http_request_duration_seconds_bucket{job="bakery-services"}[5m])) by (service, le)
              ) > 1
            for: 5m
            labels:
              severity: warning
              component: performance
            annotations:
              summary: "High response time on {{ $labels.service }}"
              description: "Service {{ $labels.service }} P95 latency is above 1 second (current: {{ $value }}s)."
              runbook_url: "https://runbooks.bakery-ia.local/HighResponseTime"

          - alert: HighMemoryUsage
            # 500000000 bytes = 500 MB threshold per container.
            expr: |
              container_memory_usage_bytes{namespace="bakery-ia", container!=""} > 500000000
            for: 5m
            labels:
              severity: warning
              component: infrastructure
            annotations:
              summary: "High memory usage in {{ $labels.pod }}"
              description: "Container {{ $labels.container }} in pod {{ $labels.pod }} is using more than 500MB of memory (current: {{ $value | humanize }}B)."
              runbook_url: "https://runbooks.bakery-ia.local/HighMemoryUsage"

          - alert: DatabaseConnectionHigh
            expr: |
              pg_stat_database_numbackends{datname="bakery"} > 80
            for: 5m
            labels:
              severity: warning
              component: database
            annotations:
              summary: "High database connection count"
              description: "Database has more than 80 active connections (current: {{ $value }})."
              runbook_url: "https://runbooks.bakery-ia.local/DatabaseConnectionHigh"

      # Business Logic Alerts
      - name: bakery_business
        interval: 30s
        rules:
          - alert: TrainingJobFailed
            expr: |
              increase(training_job_failures_total[1h]) > 0
            for: 5m
            labels:
              severity: warning
              component: ml-training
            annotations:
              summary: "Training job failures detected"
              description: "{{ $value }} training job(s) failed in the last hour."
              runbook_url: "https://runbooks.bakery-ia.local/TrainingJobFailed"

          - alert: LowPredictionAccuracy
            expr: |
              prediction_model_accuracy < 0.70
            for: 15m
            labels:
              severity: warning
              component: ml-inference
            annotations:
              summary: "Model prediction accuracy is low"
              description: "Model {{ $labels.model_name }} accuracy is below 70% (current: {{ $value | humanizePercentage }})."
              runbook_url: "https://runbooks.bakery-ia.local/LowPredictionAccuracy"

          - alert: APIRateLimitHit
            expr: |
              increase(rate_limit_hits_total[5m]) > 10
            for: 5m
            labels:
              severity: info
              component: api-gateway
            annotations:
              summary: "API rate limits being hit frequently"
              description: "Rate limits hit {{ $value }} times in the last 5 minutes."
              runbook_url: "https://runbooks.bakery-ia.local/APIRateLimitHit"

      # Alert System Health
      - name: alert_system_health
        interval: 30s
        rules:
          - alert: AlertSystemComponentDown
            expr: |
              alert_system_component_health{component=~"processor|notifier|scheduler"} == 0
            for: 2m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "Alert system component {{ $labels.component }} is unhealthy"
              description: "Component {{ $labels.component }} has been unhealthy for more than 2 minutes."
              runbook_url: "https://runbooks.bakery-ia.local/AlertSystemComponentDown"

          - alert: RabbitMQConnectionDown
            expr: |
              rabbitmq_up == 0
            for: 1m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "RabbitMQ connection is down"
              description: "Alert system has lost connection to RabbitMQ message queue."
              runbook_url: "https://runbooks.bakery-ia.local/RabbitMQConnectionDown"

          - alert: RedisConnectionDown
            expr: |
              redis_up == 0
            for: 1m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "Redis connection is down"
              description: "Alert system has lost connection to Redis cache."
              runbook_url: "https://runbooks.bakery-ia.local/RedisConnectionDown"

          - alert: NoSchedulerLeader
            # Sum across scheduler replicas; zero means no elected leader.
            expr: |
              sum(alert_system_scheduler_leader) == 0
            for: 5m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "No alert scheduler leader elected"
              description: "No scheduler instance has been elected as leader for 5 minutes."
              runbook_url: "https://runbooks.bakery-ia.local/NoSchedulerLeader"

      # Alert System Performance
      - name: alert_system_performance
        interval: 30s
        rules:
          - alert: HighAlertProcessingErrorRate
            expr: |
              (
                sum(rate(alert_processing_errors_total[2m]))
                /
                sum(rate(alerts_processed_total[2m]))
              ) > 0.10
            for: 2m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "High alert processing error rate"
              description: "Alert processing error rate is above 10% (current: {{ $value | humanizePercentage }})."
              runbook_url: "https://runbooks.bakery-ia.local/HighAlertProcessingErrorRate"

          - alert: HighNotificationDeliveryFailureRate
            expr: |
              (
                sum(rate(notification_delivery_failures_total[3m]))
                /
                sum(rate(notifications_sent_total[3m]))
              ) > 0.05
            for: 3m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "High notification delivery failure rate"
              description: "Notification delivery failure rate is above 5% (current: {{ $value | humanizePercentage }})."
              runbook_url: "https://runbooks.bakery-ia.local/HighNotificationDeliveryFailureRate"

          - alert: HighAlertProcessingLatency
            expr: |
              histogram_quantile(0.95,
                sum(rate(alert_processing_duration_seconds_bucket[5m])) by (le)
              ) > 5
            for: 5m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "High alert processing latency"
              description: "P95 alert processing latency is above 5 seconds (current: {{ $value }}s)."
              runbook_url: "https://runbooks.bakery-ia.local/HighAlertProcessingLatency"

          - alert: TooManySSEConnections
            expr: |
              sse_active_connections > 1000
            for: 2m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "Too many active SSE connections"
              description: "More than 1000 active SSE connections (current: {{ $value }})."
              runbook_url: "https://runbooks.bakery-ia.local/TooManySSEConnections"

          - alert: SSEConnectionErrors
            expr: |
              rate(sse_connection_errors_total[3m]) > 0.5
            for: 3m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "High rate of SSE connection errors"
              description: "SSE connection error rate is {{ $value }} errors/sec."
              runbook_url: "https://runbooks.bakery-ia.local/SSEConnectionErrors"

      # Alert System Business Logic
      - name: alert_system_business
        interval: 30s
        rules:
          - alert: UnusuallyHighAlertVolume
            expr: |
              rate(alerts_generated_total[5m]) > 2
            for: 5m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "Unusually high alert generation volume"
              description: "More than 2 alerts per second being generated (current: {{ $value }}/sec)."
              runbook_url: "https://runbooks.bakery-ia.local/UnusuallyHighAlertVolume"

          - alert: NoAlertsGenerated
            # Silence can indicate broken detection rather than a healthy system.
            expr: |
              rate(alerts_generated_total[30m]) == 0
            for: 15m
            labels:
              severity: info
              component: alert-system
            annotations:
              summary: "No alerts generated recently"
              description: "No alerts have been generated in the last 30 minutes. This might indicate a problem with alert detection."
              runbook_url: "https://runbooks.bakery-ia.local/NoAlertsGenerated"

          - alert: SlowAlertResponseTime
            # 3600 seconds = 1 hour P95 threshold.
            expr: |
              histogram_quantile(0.95,
                sum(rate(alert_response_time_seconds_bucket[10m])) by (le)
              ) > 3600
            for: 10m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "Slow alert response times"
              description: "P95 alert response time is above 1 hour (current: {{ $value | humanizeDuration }})."
              runbook_url: "https://runbooks.bakery-ia.local/SlowAlertResponseTime"

          - alert: CriticalAlertsUnacknowledged
            expr: |
              sum(alerts_unacknowledged{severity="critical"}) > 5
            for: 10m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "Multiple critical alerts unacknowledged"
              description: "{{ $value }} critical alerts have not been acknowledged for 10+ minutes."
              runbook_url: "https://runbooks.bakery-ia.local/CriticalAlertsUnacknowledged"

      # Alert System Capacity
      - name: alert_system_capacity
        interval: 30s
        rules:
          - alert: LargeSSEMessageQueues
            expr: |
              sse_message_queue_size > 100
            for: 5m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "Large SSE message queues detected"
              description: "SSE message queue for tenant {{ $labels.tenant_id }} has {{ $value }} messages queued."
              runbook_url: "https://runbooks.bakery-ia.local/LargeSSEMessageQueues"

          - alert: SlowDatabaseStorage
            expr: |
              histogram_quantile(0.95,
                sum(rate(alert_storage_duration_seconds_bucket[5m])) by (le)
              ) > 1
            for: 5m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "Slow alert database storage"
              description: "P95 alert storage latency is above 1 second (current: {{ $value }}s)."
              runbook_url: "https://runbooks.bakery-ia.local/SlowDatabaseStorage"

      # Alert System Critical Scenarios
      - name: alert_system_critical
        # Tighter evaluation interval for the highest-impact failure modes.
        interval: 15s
        rules:
          - alert: AlertSystemDown
            expr: |
              up{service=~"alert-processor|notification-service"} == 0
            for: 1m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "Alert system is completely down"
              description: "Core alert system service {{ $labels.service }} is down."
              runbook_url: "https://runbooks.bakery-ia.local/AlertSystemDown"

          - alert: AlertDataNotPersisted
            # Positive gap between processed and stored rates means writes are lost.
            expr: |
              (
                sum(rate(alerts_processed_total[2m]))
                -
                sum(rate(alerts_stored_total[2m]))
              ) > 0
            for: 2m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "Alerts not being persisted to database"
              description: "Alerts are being processed but not stored in the database."
              runbook_url: "https://runbooks.bakery-ia.local/AlertDataNotPersisted"

          - alert: NotificationsNotDelivered
            expr: |
              (
                sum(rate(alerts_processed_total[3m]))
                -
                sum(rate(notifications_sent_total[3m]))
              ) > 0
            for: 3m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "Notifications not being delivered"
              description: "Alerts are being processed but notifications are not being sent."
              runbook_url: "https://runbooks.bakery-ia.local/NotificationsNotDelivered"

      # Monitoring System Self-Monitoring
      - name: monitoring_health
        interval: 30s
        rules:
          - alert: PrometheusDown
            expr: up{job="prometheus"} == 0
            for: 5m
            labels:
              severity: critical
              component: monitoring
            annotations:
              summary: "Prometheus is down"
              description: "Prometheus monitoring system is not responding."
              runbook_url: "https://runbooks.bakery-ia.local/PrometheusDown"

          - alert: AlertManagerDown
            expr: up{job="alertmanager"} == 0
            for: 2m
            labels:
              severity: critical
              component: monitoring
            annotations:
              summary: "AlertManager is down"
              description: "AlertManager is not responding. Alerts will not be routed."
              runbook_url: "https://runbooks.bakery-ia.local/AlertManagerDown"

          - alert: PrometheusStorageFull
            expr: |
              (
                prometheus_tsdb_storage_blocks_bytes
                /
                (prometheus_tsdb_storage_blocks_bytes + prometheus_tsdb_wal_size_bytes)
              ) > 0.90
            for: 10m
            labels:
              severity: warning
              component: monitoring
            annotations:
              summary: "Prometheus storage almost full"
              description: "Prometheus storage is {{ $value | humanizePercentage }} full."
              runbook_url: "https://runbooks.bakery-ia.local/PrometheusStorageFull"

          - alert: PrometheusScrapeErrors
            expr: |
              rate(prometheus_target_scrapes_exceeded_sample_limit_total[5m]) > 0
            for: 5m
            labels:
              severity: warning
              component: monitoring
            annotations:
              summary: "Prometheus scrape errors detected"
              description: "Prometheus is experiencing scrape errors for target {{ $labels.job }}."
              runbook_url: "https://runbooks.bakery-ia.local/PrometheusScrapeErrors"