Improve monitoring for prod

Urtzi Alfaro
2026-01-07 19:12:35 +01:00
parent 560c7ba86f
commit 07178f8972
44 changed files with 6581 additions and 5111 deletions

@@ -1,644 +0,0 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "Comprehensive monitoring dashboard for the Bakery Alert and Recommendation System",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [],
"liveNow": false,
"panels": [
{
"datasource": "prometheus",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"vis": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
},
"id": 1,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"expr": "rate(alert_items_published_total[5m])",
"interval": "",
"legendFormat": "{{item_type}} - {{severity}}",
"refId": "A"
}
],
"title": "Alert/Recommendation Publishing Rate",
"type": "timeseries"
},
{
"datasource": "prometheus",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
},
"id": 2,
"options": {
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true,
"text": {}
},
"pluginVersion": "8.0.0",
"targets": [
{
"expr": "sum(alert_sse_active_connections)",
"interval": "",
"legendFormat": "Active SSE Connections",
"refId": "A"
}
],
"title": "Active SSE Connections",
"type": "gauge"
},
{
"datasource": "prometheus",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"hideFrom": {
"legend": false,
"tooltip": false,
"vis": false
}
},
"mappings": []
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 8,
"x": 0,
"y": 8
},
"id": 3,
"options": {
"legend": {
"displayMode": "list",
"placement": "right"
},
"pieType": "pie",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"expr": "sum by (item_type) (alert_items_published_total)",
"interval": "",
"legendFormat": "{{item_type}}",
"refId": "A"
}
],
"title": "Items by Type",
"type": "piechart"
},
{
"datasource": "prometheus",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"hideFrom": {
"legend": false,
"tooltip": false,
"vis": false
}
},
"mappings": []
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 8,
"x": 8,
"y": 8
},
"id": 4,
"options": {
"legend": {
"displayMode": "list",
"placement": "right"
},
"pieType": "pie",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"expr": "sum by (severity) (alert_items_published_total)",
"interval": "",
"legendFormat": "{{severity}}",
"refId": "A"
}
],
"title": "Items by Severity",
"type": "piechart"
},
{
"datasource": "prometheus",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"vis": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 8,
"x": 16,
"y": 8
},
"id": 5,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"expr": "rate(alert_notifications_sent_total[5m])",
"interval": "",
"legendFormat": "{{channel}}",
"refId": "A"
}
],
"title": "Notification Delivery Rate by Channel",
"type": "timeseries"
},
{
"datasource": "prometheus",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"vis": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 16
},
"id": 6,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"expr": "histogram_quantile(0.95, rate(alert_processing_duration_seconds_bucket[5m]))",
"interval": "",
"legendFormat": "95th percentile",
"refId": "A"
},
{
"expr": "histogram_quantile(0.50, rate(alert_processing_duration_seconds_bucket[5m]))",
"interval": "",
"legendFormat": "50th percentile (median)",
"refId": "B"
}
],
"title": "Processing Duration",
"type": "timeseries"
},
{
"datasource": "prometheus",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"vis": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 16
},
"id": 7,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"expr": "rate(alert_processing_errors_total[5m])",
"interval": "",
"legendFormat": "{{error_type}}",
"refId": "A"
},
{
"expr": "rate(alert_delivery_failures_total[5m])",
"interval": "",
"legendFormat": "Delivery: {{channel}}",
"refId": "B"
}
],
"title": "Error Rates",
"type": "timeseries"
},
{
"datasource": "prometheus",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"custom": {
"align": "auto",
"displayMode": "auto"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Health"
},
"properties": [
{
"id": "custom.displayMode",
"value": "color-background"
},
{
"id": "mappings",
"value": [
{
"options": {
"0": {
"color": "red",
"index": 0,
"text": "Unhealthy"
},
"1": {
"color": "green",
"index": 1,
"text": "Healthy"
}
},
"type": "value"
}
]
}
]
}
]
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 24
},
"id": 8,
"options": {
"showHeader": true
},
"pluginVersion": "8.0.0",
"targets": [
{
"expr": "alert_system_component_health",
"format": "table",
"interval": "",
"legendFormat": "",
"refId": "A"
}
],
"title": "System Component Health",
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": {
"__name__": true,
"instance": true,
"job": true
},
"indexByName": {},
"renameByName": {
"Value": "Health",
"component": "Component",
"service": "Service"
}
}
}
],
"type": "table"
}
],
"schemaVersion": 27,
"style": "dark",
"tags": [
"bakery",
"alerts",
"recommendations",
"monitoring"
],
"templating": {
"list": []
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {},
"timezone": "Europe/Madrid",
"title": "Bakery Alert & Recommendation System",
"uid": "bakery-alert-system",
"version": 1
}

@@ -1,15 +0,0 @@
# infrastructure/monitoring/grafana/dashboards/dashboard.yml
# Grafana dashboard provisioning
apiVersion: 1

providers:
  - name: 'bakery-dashboards'
    orgId: 1
    folder: 'Bakery Forecasting'
    type: file
    disableDeletion: false
    updateIntervalSeconds: 10
    allowUiUpdates: true
    options:
      path: /etc/grafana/provisioning/dashboards
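
A minimal sketch of how these provisioning files might be wired into the Grafana container, assuming a docker-compose deployment; the compose service name, image tag, and host paths are illustrative and not taken from this commit. Because the provider's options.path points at the same /etc/grafana/provisioning/dashboards directory that Grafana scans for provider files, one mount can cover both the provider YAML and the dashboard JSON above; the datasource files (next file) go under provisioning/datasources.

  grafana:
    image: grafana/grafana:8.0.0   # hypothetical tag, chosen to match the dashboard's pluginVersion
    volumes:
      # provider file (dashboard.yml) and dashboard JSON share one directory,
      # which is also the provider's options.path
      - ./infrastructure/monitoring/grafana/dashboards:/etc/grafana/provisioning/dashboards:ro
      # Prometheus + Jaeger datasource provisioning (defined in the next file)
      - ./infrastructure/monitoring/grafana/datasources:/etc/grafana/provisioning/datasources:ro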

@@ -1,28 +0,0 @@
# infrastructure/monitoring/grafana/datasources/prometheus.yml
# Grafana Prometheus datasource configuration
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
    version: 1
    editable: true
    jsonData:
      timeInterval: "15s"
      queryTimeout: "60s"
      httpMethod: "POST"
      exemplarTraceIdDestinations:
        - name: trace_id
          datasourceUid: jaeger

  - name: Jaeger
    type: jaeger
    access: proxy
    url: http://jaeger:16686
    uid: jaeger
    version: 1
    editable: true

@@ -1,42 +0,0 @@
# ================================================================
# Monitoring Configuration: infrastructure/monitoring/prometheus/forecasting-service.yml
# ================================================================
groups:
  - name: forecasting-service
    rules:
      - alert: ForecastingServiceDown
        expr: up{job="forecasting-service"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Forecasting service is down"
          description: "Forecasting service has been down for more than 1 minute"

      - alert: HighForecastingLatency
        # 95th percentile computed from the histogram buckets
        expr: histogram_quantile(0.95, rate(forecast_processing_time_seconds_bucket[5m])) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High forecasting latency"
          description: "95th percentile forecasting latency is {{ $value }}s"

      - alert: ForecastingErrorRate
        expr: rate(forecasting_errors_total[5m]) > 0.1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High forecasting error rate"
          description: "Forecasting error rate is {{ $value }} errors/sec"

      - alert: LowModelAccuracy
        expr: avg(model_accuracy_score) < 0.7
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Low model accuracy detected"
          description: "Average model accuracy is {{ $value }}"

@@ -1,88 +0,0 @@
# infrastructure/monitoring/prometheus/prometheus.yml
# Prometheus configuration
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    cluster: 'bakery-forecasting'
    replica: 'prometheus-01'

rule_files:
  - "/etc/prometheus/rules/*.yml"

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            # - alertmanager:9093

scrape_configs:
  # Service discovery for microservices
  - job_name: 'gateway'
    static_configs:
      - targets: ['gateway-service:8000']
    metrics_path: '/metrics'
    scrape_interval: 30s
    scrape_timeout: 10s

  - job_name: 'auth-service'
    static_configs:
      - targets: ['auth-service:8000']
    metrics_path: '/metrics'
    scrape_interval: 30s

  - job_name: 'tenant-service'
    static_configs:
      - targets: ['tenant-service:8000']
    metrics_path: '/metrics'
    scrape_interval: 30s

  - job_name: 'training-service'
    static_configs:
      - targets: ['training-service:8000']
    metrics_path: '/metrics'
    scrape_interval: 30s

  - job_name: 'forecasting-service'
    static_configs:
      - targets: ['forecasting-service:8000']
    metrics_path: '/metrics'
    scrape_interval: 30s

  - job_name: 'sales-service'
    static_configs:
      - targets: ['sales-service:8000']
    metrics_path: '/metrics'
    scrape_interval: 30s

  - job_name: 'external-service'
    static_configs:
      - targets: ['external-service:8000']
    metrics_path: '/metrics'
    scrape_interval: 30s

  - job_name: 'notification-service'
    static_configs:
      - targets: ['notification-service:8000']
    metrics_path: '/metrics'
    scrape_interval: 30s

  # Infrastructure monitoring
  - job_name: 'redis'
    static_configs:
      - targets: ['redis:6379']
    metrics_path: '/metrics'
    scrape_interval: 30s

  - job_name: 'rabbitmq'
    static_configs:
      - targets: ['rabbitmq:15692']
    metrics_path: '/metrics'
    scrape_interval: 30s

  # Database monitoring (requires postgres_exporter)
  - job_name: 'postgres'
    static_configs:
      - targets: ['postgres-exporter:9187']
    scrape_interval: 30s
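
The alerting block above leaves the Alertmanager target commented out, so the rule files referenced by rule_files are evaluated but their alerts are routed nowhere. If and when an Alertmanager is deployed, the block would look roughly like this, assuming the alertmanager:9093 address already hinted at in the comment:

  alerting:
    alertmanagers:
      - static_configs:
          - targets:
              - alertmanager:9093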

@@ -1,243 +0,0 @@
# infrastructure/monitoring/prometheus/rules/alert-system-rules.yml
# Prometheus alerting rules for the Bakery Alert and Recommendation System
groups:
  - name: alert_system_health
    rules:
      # System component health alerts
      - alert: AlertSystemComponentDown
        expr: alert_system_component_health == 0
        for: 2m
        labels:
          severity: critical
          service: "{{ $labels.service }}"
          component: "{{ $labels.component }}"
        annotations:
          summary: "Alert system component {{ $labels.component }} is unhealthy"
          description: "Component {{ $labels.component }} in service {{ $labels.service }} has been unhealthy for more than 2 minutes."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#component-health"

      # Connection health alerts
      - alert: RabbitMQConnectionDown
        expr: alert_rabbitmq_connection_status == 0
        for: 1m
        labels:
          severity: critical
          service: "{{ $labels.service }}"
        annotations:
          summary: "RabbitMQ connection down for {{ $labels.service }}"
          description: "Service {{ $labels.service }} has lost connection to RabbitMQ for more than 1 minute."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#rabbitmq-connection"

      - alert: RedisConnectionDown
        expr: alert_redis_connection_status == 0
        for: 1m
        labels:
          severity: critical
          service: "{{ $labels.service }}"
        annotations:
          summary: "Redis connection down for {{ $labels.service }}"
          description: "Service {{ $labels.service }} has lost connection to Redis for more than 1 minute."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#redis-connection"

      # Leader election issues
      - alert: NoSchedulerLeader
        expr: sum(alert_scheduler_leader_status) == 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "No scheduler leader elected"
          description: "No service has been elected as scheduler leader for more than 5 minutes. Scheduled checks may not be running."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#leader-election"

  - name: alert_system_performance
    rules:
      # High error rates
      - alert: HighAlertProcessingErrorRate
        expr: rate(alert_processing_errors_total[5m]) > 0.1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High alert processing error rate"
          description: "Alert processing error rate is {{ $value | humanizePercentage }} over the last 5 minutes."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#processing-errors"

      - alert: HighNotificationDeliveryFailureRate
        expr: rate(alert_delivery_failures_total[5m]) / rate(alert_notifications_sent_total[5m]) > 0.05
        for: 3m
        labels:
          severity: warning
          channel: "{{ $labels.channel }}"
        annotations:
          summary: "High notification delivery failure rate for {{ $labels.channel }}"
          description: "Notification delivery failure rate for {{ $labels.channel }} is {{ $value | humanizePercentage }} over the last 5 minutes."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#delivery-failures"

      # Processing latency
      - alert: HighAlertProcessingLatency
        expr: histogram_quantile(0.95, rate(alert_processing_duration_seconds_bucket[5m])) > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High alert processing latency"
          description: "95th percentile alert processing latency is {{ $value }}s, exceeding 5s threshold."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#processing-latency"

      # SSE connection issues
      - alert: TooManySSEConnections
        expr: sum(alert_sse_active_connections) > 1000
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Too many active SSE connections"
          description: "Number of active SSE connections ({{ $value }}) exceeds 1000. This may impact performance."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#sse-connections"

      - alert: SSEConnectionErrors
        expr: rate(alert_sse_connection_errors_total[5m]) > 0.5
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "High SSE connection error rate"
          description: "SSE connection error rate is {{ $value }} errors/second over the last 5 minutes."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#sse-errors"

  - name: alert_system_business
    rules:
      # Alert volume anomalies
      - alert: UnusuallyHighAlertVolume
        expr: rate(alert_items_published_total{item_type="alert"}[10m]) > 2
        for: 5m
        labels:
          severity: warning
          service: "{{ $labels.service }}"
        annotations:
          summary: "Unusually high alert volume from {{ $labels.service }}"
          description: "Service {{ $labels.service }} is generating alerts at {{ $value }} alerts/second, which is above normal levels."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#high-volume"

      - alert: NoAlertsGenerated
        expr: rate(alert_items_published_total[30m]) == 0
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "No alerts generated recently"
          description: "No alerts have been generated in the last 30 minutes. This may indicate a problem with detection systems."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#no-alerts"

      # Response time issues
      - alert: SlowAlertResponseTime
        expr: histogram_quantile(0.95, rate(alert_item_response_time_seconds_bucket[1h])) > 3600
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Slow alert response times"
          description: "95th percentile alert response time is {{ $value | humanizeDuration }}, exceeding 1 hour."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#response-times"

      # Critical alerts not acknowledged
      - alert: CriticalAlertsUnacknowledged
        expr: sum(alert_active_items_current{item_type="alert",severity="urgent"}) > 5
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "Multiple critical alerts unacknowledged"
          description: "{{ $value }} critical alerts remain unacknowledged for more than 10 minutes."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#critical-unacked"

  - name: alert_system_capacity
    rules:
      # Queue size monitoring
      - alert: LargeSSEMessageQueues
        expr: alert_sse_message_queue_size > 100
        for: 5m
        labels:
          severity: warning
          tenant_id: "{{ $labels.tenant_id }}"
        annotations:
          summary: "Large SSE message queue for tenant {{ $labels.tenant_id }}"
          description: "SSE message queue for tenant {{ $labels.tenant_id }} has {{ $value }} messages, indicating potential client issues."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#sse-queues"

      # Database storage issues
      - alert: SlowDatabaseStorage
        expr: histogram_quantile(0.95, rate(alert_database_storage_duration_seconds_bucket[5m])) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Slow database storage for alerts"
          description: "95th percentile database storage time is {{ $value }}s, exceeding 1s threshold."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#database-storage"

  - name: alert_system_effectiveness
    rules:
      # False positive rate monitoring
      - alert: HighFalsePositiveRate
        expr: alert_false_positive_rate > 0.2
        for: 30m
        labels:
          severity: warning
          service: "{{ $labels.service }}"
          alert_type: "{{ $labels.alert_type }}"
        annotations:
          summary: "High false positive rate for {{ $labels.alert_type }}"
          description: "False positive rate for {{ $labels.alert_type }} in {{ $labels.service }} is {{ $value | humanizePercentage }}."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#false-positives"

      # Low recommendation adoption
      - alert: LowRecommendationAdoption
        expr: rate(alert_recommendations_implemented_total[24h]) / rate(alert_items_published_total{item_type="recommendation"}[24h]) < 0.1
        for: 1h
        labels:
          severity: info
          service: "{{ $labels.service }}"
        annotations:
          summary: "Low recommendation adoption rate"
          description: "Recommendation adoption rate for {{ $labels.service }} is {{ $value | humanizePercentage }} over the last 24 hours."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#recommendation-adoption"

  # Additional alerting rules for specific scenarios
  - name: alert_system_critical_scenarios
    rules:
      # Complete system failure
      - alert: AlertSystemDown
        expr: up{job=~"alert-processor|notification-service"} == 0
        for: 1m
        labels:
          severity: critical
          service: "{{ $labels.job }}"
        annotations:
          summary: "Alert system service {{ $labels.job }} is down"
          description: "Critical alert system service {{ $labels.job }} has been down for more than 1 minute."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#service-down"

      # Data loss prevention
      - alert: AlertDataNotPersisted
        expr: rate(alert_items_processed_total[5m]) > 0 and rate(alert_database_storage_duration_seconds_count[5m]) == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Alert data not being persisted to database"
          description: "Alerts are being processed but not stored in database, potential data loss."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#data-persistence"

      # Notification blackhole
      - alert: NotificationsNotDelivered
        expr: rate(alert_items_processed_total[5m]) > 0 and rate(alert_notifications_sent_total[5m]) == 0
        for: 3m
        labels:
          severity: critical
        annotations:
          summary: "Notifications not being delivered"
          description: "Alerts are being processed but no notifications are being sent."
          runbook_url: "https://docs.bakery.local/runbooks/alert-system#notification-delivery"

@@ -1,86 +0,0 @@
# infrastructure/monitoring/prometheus/rules/alerts.yml
# Prometheus alerting rules
groups:
  - name: bakery_services
    rules:
      # Service availability alerts
      - alert: ServiceDown
        expr: up == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Service {{ $labels.job }} is down"
          description: "Service {{ $labels.job }} has been down for more than 2 minutes."

      # High error rate alerts
      - alert: HighErrorRate
        expr: rate(http_requests_total{status=~"5.."}[5m]) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate on {{ $labels.job }}"
          description: "Error rate is {{ $value }} errors per second on {{ $labels.job }}."

      # High response time alerts
      - alert: HighResponseTime
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High response time on {{ $labels.job }}"
          description: "95th percentile response time is {{ $value }}s on {{ $labels.job }}."

      # Memory usage alerts
      - alert: HighMemoryUsage
        expr: process_resident_memory_bytes / 1024 / 1024 > 500
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage on {{ $labels.job }}"
          description: "Memory usage is {{ $value }}MB on {{ $labels.job }}."

      # Database connection alerts
      - alert: DatabaseConnectionHigh
        expr: pg_stat_activity_count > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High database connections"
          description: "Database has {{ $value }} active connections."

  - name: bakery_business
    rules:
      # Training job alerts
      - alert: TrainingJobFailed
        expr: increase(training_jobs_failed_total[1h]) > 0
        labels:
          severity: warning
        annotations:
          summary: "Training job failed"
          description: "{{ $value }} training jobs have failed in the last hour."

      # Prediction accuracy alerts
      - alert: LowPredictionAccuracy
        expr: prediction_accuracy < 0.7
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "Low prediction accuracy"
          description: "Prediction accuracy is {{ $value }} for tenant {{ $labels.tenant_id }}."

      # API rate limit alerts
      - alert: APIRateLimitHit
        expr: increase(rate_limit_hits_total[5m]) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "API rate limit hit frequently"
          description: "Rate limit has been hit {{ $value }} times in 5 minutes."