Add SigNoz

2026-01-08 12:58:00 +01:00
parent 07178f8972
commit dfb7e4b237
40 changed files with 2049 additions and 3935 deletions
--- a/infrastructure/kubernetes/base/components/infrastructure/gateway-service.yaml
+++ b/infrastructure/kubernetes/base/components/infrastructure/gateway-service.yaml
@@ -48,6 +48,14 @@ spec:
            name: pos-integration-secrets
        - secretRef:
            name: whatsapp-secrets
+        env:
+        # Telemetry export target: the otel-collector Service in the monitoring
+        # namespace. 4317 is the conventional OTLP/gRPC port, so the protocol is
+        # pinned explicitly rather than left to SDK defaults (which vary by SDK).
+        - name: OTEL_EXPORTER_OTLP_ENDPOINT
+          value: "http://otel-collector.monitoring.svc.cluster.local:4317"
+        - name: OTEL_EXPORTER_OTLP_PROTOCOL
+          value: "grpc"
        resources:
          requests:
            memory: "256Mi"
--- a/infrastructure/kubernetes/base/components/monitoring/alert-rules.yaml
+++ b/infrastructure/kubernetes/base/components/monitoring/alert-rules.yaml
@@ -1,429 +0,0 @@
---
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: prometheus-alert-rules
-  namespace: monitoring
-data:
-  alert-rules.yml: |
-    groups:
-    # Basic Infrastructure Alerts
-    - name: bakery_services
-      interval: 30s
-      rules:
-      - alert: ServiceDown
-        expr: up{job="bakery-services"} == 0
-        for: 2m
-        labels:
-          severity: critical
-          component: infrastructure
-        annotations:
-          summary: "Service {{ $labels.service }} is down"
-          description: "Service {{ $labels.service }} in namespace {{ $labels.namespace }} has been down for more than 2 minutes."
-          runbook_url: "https://runbooks.bakery-ia.local/ServiceDown"
-
-      - alert: HighErrorRate
-        expr: |
-          (
-            sum(rate(http_requests_total{status_code=~"5..", job="bakery-services"}[5m])) by (service)
-            /
-            sum(rate(http_requests_total{job="bakery-services"}[5m])) by (service)
-          ) > 0.10
-        for: 5m
-        labels:
-          severity: critical
-          component: application
-        annotations:
-          summary: "High error rate on {{ $labels.service }}"
-          description: "Service {{ $labels.service }} has error rate above 10% (current: {{ $value | humanizePercentage }})."
-          runbook_url: "https://runbooks.bakery-ia.local/HighErrorRate"
-
-      - alert: HighResponseTime
-        expr: |
-          histogram_quantile(0.95,
-            sum(rate(http_request_duration_seconds_bucket{job="bakery-services"}[5m])) by (service, le)
-          ) > 1
-        for: 5m
-        labels:
-          severity: warning
-          component: performance
-        annotations:
-          summary: "High response time on {{ $labels.service }}"
-          description: "Service {{ $labels.service }} P95 latency is above 1 second (current: {{ $value }}s)."
-          runbook_url: "https://runbooks.bakery-ia.local/HighResponseTime"
-
-      - alert: HighMemoryUsage
-        expr: |
-          container_memory_usage_bytes{namespace="bakery-ia", container!=""} > 500000000
-        for: 5m
-        labels:
-          severity: warning
-          component: infrastructure
-        annotations:
-          summary: "High memory usage in {{ $labels.pod }}"
-          description: "Container {{ $labels.container }} in pod {{ $labels.pod }} is using more than 500MB of memory (current: {{ $value | humanize }}B)."
-          runbook_url: "https://runbooks.bakery-ia.local/HighMemoryUsage"
-
-      - alert: DatabaseConnectionHigh
-        expr: |
-          pg_stat_database_numbackends{datname="bakery"} > 80
-        for: 5m
-        labels:
-          severity: warning
-          component: database
-        annotations:
-          summary: "High database connection count"
-          description: "Database has more than 80 active connections (current: {{ $value }})."
-          runbook_url: "https://runbooks.bakery-ia.local/DatabaseConnectionHigh"
-
-    # Business Logic Alerts
-    - name: bakery_business
-      interval: 30s
-      rules:
-      - alert: TrainingJobFailed
-        expr: |
-          increase(training_job_failures_total[1h]) > 0
-        for: 5m
-        labels:
-          severity: warning
-          component: ml-training
-        annotations:
-          summary: "Training job failures detected"
-          description: "{{ $value }} training job(s) failed in the last hour."
-          runbook_url: "https://runbooks.bakery-ia.local/TrainingJobFailed"
-
-      - alert: LowPredictionAccuracy
-        expr: |
-          prediction_model_accuracy < 0.70
-        for: 15m
-        labels:
-          severity: warning
-          component: ml-inference
-        annotations:
-          summary: "Model prediction accuracy is low"
-          description: "Model {{ $labels.model_name }} accuracy is below 70% (current: {{ $value | humanizePercentage }})."
-          runbook_url: "https://runbooks.bakery-ia.local/LowPredictionAccuracy"
-
-      - alert: APIRateLimitHit
-        expr: |
-          increase(rate_limit_hits_total[5m]) > 10
-        for: 5m
-        labels:
-          severity: info
-          component: api-gateway
-        annotations:
-          summary: "API rate limits being hit frequently"
-          description: "Rate limits hit {{ $value }} times in the last 5 minutes."
-          runbook_url: "https://runbooks.bakery-ia.local/APIRateLimitHit"
-
-    # Alert System Health
-    - name: alert_system_health
-      interval: 30s
-      rules:
-      - alert: AlertSystemComponentDown
-        expr: |
-          alert_system_component_health{component=~"processor|notifier|scheduler"} == 0
-        for: 2m
-        labels:
-          severity: critical
-          component: alert-system
-        annotations:
-          summary: "Alert system component {{ $labels.component }} is unhealthy"
-          description: "Component {{ $labels.component }} has been unhealthy for more than 2 minutes."
-          runbook_url: "https://runbooks.bakery-ia.local/AlertSystemComponentDown"
-
-      - alert: RabbitMQConnectionDown
-        expr: |
-          rabbitmq_up == 0
-        for: 1m
-        labels:
-          severity: critical
-          component: alert-system
-        annotations:
-          summary: "RabbitMQ connection is down"
-          description: "Alert system has lost connection to RabbitMQ message queue."
-          runbook_url: "https://runbooks.bakery-ia.local/RabbitMQConnectionDown"
-
-      - alert: RedisConnectionDown
-        expr: |
-          redis_up == 0
-        for: 1m
-        labels:
-          severity: critical
-          component: alert-system
-        annotations:
-          summary: "Redis connection is down"
-          description: "Alert system has lost connection to Redis cache."
-          runbook_url: "https://runbooks.bakery-ia.local/RedisConnectionDown"
-
-      - alert: NoSchedulerLeader
-        expr: |
-          sum(alert_system_scheduler_leader) == 0
-        for: 5m
-        labels:
-          severity: warning
-          component: alert-system
-        annotations:
-          summary: "No alert scheduler leader elected"
-          description: "No scheduler instance has been elected as leader for 5 minutes."
-          runbook_url: "https://runbooks.bakery-ia.local/NoSchedulerLeader"
-
-    # Alert System Performance
-    - name: alert_system_performance
-      interval: 30s
-      rules:
-      - alert: HighAlertProcessingErrorRate
-        expr: |
-          (
-            sum(rate(alert_processing_errors_total[2m]))
-            /
-            sum(rate(alerts_processed_total[2m]))
-          ) > 0.10
-        for: 2m
-        labels:
-          severity: critical
-          component: alert-system
-        annotations:
-          summary: "High alert processing error rate"
-          description: "Alert processing error rate is above 10% (current: {{ $value | humanizePercentage }})."
-          runbook_url: "https://runbooks.bakery-ia.local/HighAlertProcessingErrorRate"
-
-      - alert: HighNotificationDeliveryFailureRate
-        expr: |
-          (
-            sum(rate(notification_delivery_failures_total[3m]))
-            /
-            sum(rate(notifications_sent_total[3m]))
-          ) > 0.05
-        for: 3m
-        labels:
-          severity: warning
-          component: alert-system
-        annotations:
-          summary: "High notification delivery failure rate"
-          description: "Notification delivery failure rate is above 5% (current: {{ $value | humanizePercentage }})."
-          runbook_url: "https://runbooks.bakery-ia.local/HighNotificationDeliveryFailureRate"
-
-      - alert: HighAlertProcessingLatency
-        expr: |
-          histogram_quantile(0.95,
-            sum(rate(alert_processing_duration_seconds_bucket[5m])) by (le)
-          ) > 5
-        for: 5m
-        labels:
-          severity: warning
-          component: alert-system
-        annotations:
-          summary: "High alert processing latency"
-          description: "P95 alert processing latency is above 5 seconds (current: {{ $value }}s)."
-          runbook_url: "https://runbooks.bakery-ia.local/HighAlertProcessingLatency"
-
-      - alert: TooManySSEConnections
-        expr: |
-          sse_active_connections > 1000
-        for: 2m
-        labels:
-          severity: warning
-          component: alert-system
-        annotations:
-          summary: "Too many active SSE connections"
-          description: "More than 1000 active SSE connections (current: {{ $value }})."
-          runbook_url: "https://runbooks.bakery-ia.local/TooManySSEConnections"
-
-      - alert: SSEConnectionErrors
-        expr: |
-          rate(sse_connection_errors_total[3m]) > 0.5
-        for: 3m
-        labels:
-          severity: warning
-          component: alert-system
-        annotations:
-          summary: "High rate of SSE connection errors"
-          description: "SSE connection error rate is {{ $value }} errors/sec."
-          runbook_url: "https://runbooks.bakery-ia.local/SSEConnectionErrors"
-
-    # Alert System Business Logic
-    - name: alert_system_business
-      interval: 30s
-      rules:
-      - alert: UnusuallyHighAlertVolume
-        expr: |
-          rate(alerts_generated_total[5m]) > 2
-        for: 5m
-        labels:
-          severity: warning
-          component: alert-system
-        annotations:
-          summary: "Unusually high alert generation volume"
-          description: "More than 2 alerts per second being generated (current: {{ $value }}/sec)."
-          runbook_url: "https://runbooks.bakery-ia.local/UnusuallyHighAlertVolume"
-
-      - alert: NoAlertsGenerated
-        expr: |
-          rate(alerts_generated_total[30m]) == 0
-        for: 15m
-        labels:
-          severity: info
-          component: alert-system
-        annotations:
-          summary: "No alerts generated recently"
-          description: "No alerts have been generated in the last 30 minutes. This might indicate a problem with alert detection."
-          runbook_url: "https://runbooks.bakery-ia.local/NoAlertsGenerated"
-
-      - alert: SlowAlertResponseTime
-        expr: |
-          histogram_quantile(0.95,
-            sum(rate(alert_response_time_seconds_bucket[10m])) by (le)
-          ) > 3600
-        for: 10m
-        labels:
-          severity: warning
-          component: alert-system
-        annotations:
-          summary: "Slow alert response times"
-          description: "P95 alert response time is above 1 hour (current: {{ $value | humanizeDuration }})."
-          runbook_url: "https://runbooks.bakery-ia.local/SlowAlertResponseTime"
-
-      - alert: CriticalAlertsUnacknowledged
-        expr: |
-          sum(alerts_unacknowledged{severity="critical"}) > 5
-        for: 10m
-        labels:
-          severity: warning
-          component: alert-system
-        annotations:
-          summary: "Multiple critical alerts unacknowledged"
-          description: "{{ $value }} critical alerts have not been acknowledged for 10+ minutes."
-          runbook_url: "https://runbooks.bakery-ia.local/CriticalAlertsUnacknowledged"
-
-    # Alert System Capacity
-    - name: alert_system_capacity
-      interval: 30s
-      rules:
-      - alert: LargeSSEMessageQueues
-        expr: |
-          sse_message_queue_size > 100
-        for: 5m
-        labels:
-          severity: warning
-          component: alert-system
-        annotations:
-          summary: "Large SSE message queues detected"
-          description: "SSE message queue for tenant {{ $labels.tenant_id }} has {{ $value }} messages queued."
-          runbook_url: "https://runbooks.bakery-ia.local/LargeSSEMessageQueues"
-
-      - alert: SlowDatabaseStorage
-        expr: |
-          histogram_quantile(0.95,
-            sum(rate(alert_storage_duration_seconds_bucket[5m])) by (le)
-          ) > 1
-        for: 5m
-        labels:
-          severity: warning
-          component: alert-system
-        annotations:
-          summary: "Slow alert database storage"
-          description: "P95 alert storage latency is above 1 second (current: {{ $value }}s)."
-          runbook_url: "https://runbooks.bakery-ia.local/SlowDatabaseStorage"
-
-    # Alert System Critical Scenarios
-    - name: alert_system_critical
-      interval: 15s
-      rules:
-      - alert: AlertSystemDown
-        expr: |
-          up{service=~"alert-processor|notification-service"} == 0
-        for: 1m
-        labels:
-          severity: critical
-          component: alert-system
-        annotations:
-          summary: "Alert system is completely down"
-          description: "Core alert system service {{ $labels.service }} is down."
-          runbook_url: "https://runbooks.bakery-ia.local/AlertSystemDown"
-
-      - alert: AlertDataNotPersisted
-        expr: |
-          (
-            sum(rate(alerts_processed_total[2m]))
-            -
-            sum(rate(alerts_stored_total[2m]))
-          ) > 0
-        for: 2m
-        labels:
-          severity: critical
-          component: alert-system
-        annotations:
-          summary: "Alerts not being persisted to database"
-          description: "Alerts are being processed but not stored in the database."
-          runbook_url: "https://runbooks.bakery-ia.local/AlertDataNotPersisted"
-
-      - alert: NotificationsNotDelivered
-        expr: |
-          (
-            sum(rate(alerts_processed_total[3m]))
-            -
-            sum(rate(notifications_sent_total[3m]))
-          ) > 0
-        for: 3m
-        labels:
-          severity: critical
-          component: alert-system
-        annotations:
-          summary: "Notifications not being delivered"
-          description: "Alerts are being processed but notifications are not being sent."
-          runbook_url: "https://runbooks.bakery-ia.local/NotificationsNotDelivered"
-
-    # Monitoring System Self-Monitoring
-    - name: monitoring_health
-      interval: 30s
-      rules:
-      - alert: PrometheusDown
-        expr: up{job="prometheus"} == 0
-        for: 5m
-        labels:
-          severity: critical
-          component: monitoring
-        annotations:
-          summary: "Prometheus is down"
-          description: "Prometheus monitoring system is not responding."
-          runbook_url: "https://runbooks.bakery-ia.local/PrometheusDown"
-
-      - alert: AlertManagerDown
-        expr: up{job="alertmanager"} == 0
-        for: 2m
-        labels:
-          severity: critical
-          component: monitoring
-        annotations:
-          summary: "AlertManager is down"
-          description: "AlertManager is not responding. Alerts will not be routed."
-          runbook_url: "https://runbooks.bakery-ia.local/AlertManagerDown"
-
-      - alert: PrometheusStorageFull
-        expr: |
-          (
-            prometheus_tsdb_storage_blocks_bytes
-            /
-            (prometheus_tsdb_storage_blocks_bytes + prometheus_tsdb_wal_size_bytes)
-          ) > 0.90
-        for: 10m
-        labels:
-          severity: warning
-          component: monitoring
-        annotations:
-          summary: "Prometheus storage almost full"
-          description: "Prometheus storage is {{ $value | humanizePercentage }} full."
-          runbook_url: "https://runbooks.bakery-ia.local/PrometheusStorageFull"
-
-      - alert: PrometheusScrapeErrors
-        expr: |
-          rate(prometheus_target_scrapes_exceeded_sample_limit_total[5m]) > 0
-        for: 5m
-        labels:
-          severity: warning
-          component: monitoring
-        annotations:
-          summary: "Prometheus scrape errors detected"
-          description: "Prometheus is experiencing scrape errors for target {{ $labels.job }}."
-          runbook_url: "https://runbooks.bakery-ia.local/PrometheusScrapeErrors"
--- a/infrastructure/kubernetes/base/components/monitoring/alertmanager-init.yaml
+++ b/infrastructure/kubernetes/base/components/monitoring/alertmanager-init.yaml
@@ -1,27 +0,0 @@
---
-# InitContainer to substitute secrets into AlertManager config
-# This allows us to use environment variables from secrets in the config file
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: alertmanager-init-script
-  namespace: monitoring
-data:
-  init-config.sh: |
-    #!/bin/sh
-    set -e
-
-    # Read the template config
-    TEMPLATE=$(cat /etc/alertmanager-template/alertmanager.yml)
-
-    # Substitute environment variables
-    echo "$TEMPLATE" | \
-      sed "s|{{ .smtp_host }}|${SMTP_HOST}|g" | \
-      sed "s|{{ .smtp_from }}|${SMTP_FROM}|g" | \
-      sed "s|{{ .smtp_username }}|${SMTP_USERNAME}|g" | \
-      sed "s|{{ .smtp_password }}|${SMTP_PASSWORD}|g" | \
-      sed "s|{{ .slack_webhook_url }}|${SLACK_WEBHOOK_URL}|g" \
-      > /etc/alertmanager-final/alertmanager.yml
-
-    echo "AlertManager config initialized successfully"
-    cat /etc/alertmanager-final/alertmanager.yml
--- a/infrastructure/kubernetes/base/components/monitoring/alertmanager.yaml
+++ b/infrastructure/kubernetes/base/components/monitoring/alertmanager.yaml
@@ -1,391 +0,0 @@
---
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: alertmanager-config
-  namespace: monitoring
-data:
-  alertmanager.yml: |
-    global:
-      resolve_timeout: 5m
-      smtp_smarthost: '{{ .smtp_host }}'
-      smtp_from: '{{ .smtp_from }}'
-      smtp_auth_username: '{{ .smtp_username }}'
-      smtp_auth_password: '{{ .smtp_password }}'
-      smtp_require_tls: true
-
-    # Define notification templates
-    templates:
-    - '/etc/alertmanager/templates/*.tmpl'
-
-    # Route alerts to appropriate receivers
-    route:
-      # Default receiver
-      receiver: 'default-email'
-      # Group alerts by these labels
-      group_by: ['alertname', 'cluster', 'service']
-      # Wait time before sending initial notification
-      group_wait: 10s
-      # Wait time before sending notifications about new alerts in the group
-      group_interval: 10s
-      # Wait time before re-sending a notification
-      repeat_interval: 12h
-
-      # Child routes for specific alert routing
-      routes:
-      # Critical alerts - send immediately to all channels
-      - match:
-          severity: critical
-        receiver: 'critical-alerts'
-        group_wait: 0s
-        group_interval: 5m
-        repeat_interval: 4h
-        continue: true
-
-      # Warning alerts - less urgent
-      - match:
-          severity: warning
-        receiver: 'warning-alerts'
-        group_wait: 30s
-        group_interval: 5m
-        repeat_interval: 12h
-
-      # Alert system specific alerts
-      - match:
-          component: alert-system
-        receiver: 'alert-system-team'
-        group_wait: 10s
-        repeat_interval: 6h
-
-      # Database alerts
-      - match_re:
-          alertname: ^(DatabaseConnectionHigh|SlowDatabaseStorage)$
-        receiver: 'database-team'
-        group_wait: 30s
-        repeat_interval: 8h
-
-      # Infrastructure alerts
-      - match_re:
-          alertname: ^(HighMemoryUsage|ServiceDown)$
-        receiver: 'infra-team'
-        group_wait: 30s
-        repeat_interval: 6h
-
-    # Inhibition rules - prevent alert spam
-    inhibit_rules:
-    # If service is down, inhibit all other alerts for that service
-    - source_match:
-        alertname: 'ServiceDown'
-      target_match_re:
-        alertname: '(HighErrorRate|HighResponseTime|HighMemoryUsage)'
-      equal: ['service']
-
-    # If AlertSystem is completely down, inhibit component alerts
-    - source_match:
-        alertname: 'AlertSystemDown'
-      target_match_re:
-        alertname: 'AlertSystemComponent.*'
-      equal: ['namespace']
-
-    # If RabbitMQ is down, inhibit alert processing errors
-    - source_match:
-        alertname: 'RabbitMQConnectionDown'
-      target_match:
-        alertname: 'HighAlertProcessingErrorRate'
-      equal: ['namespace']
-
-    # Receivers - notification destinations
-    receivers:
-    # Default email receiver
-    - name: 'default-email'
-      email_configs:
-      - to: 'alerts@yourdomain.com'
-        headers:
-          Subject: '[{{ .Status | toUpper }}] {{ .GroupLabels.alertname }} - {{ .GroupLabels.service }}'
-        html: |
-          {{ range .Alerts }}
-          <h2>{{ .Labels.alertname }}</h2>
-          <p><strong>Status:</strong> {{ .Status }}</p>
-          <p><strong>Severity:</strong> {{ .Labels.severity }}</p>
-          <p><strong>Service:</strong> {{ .Labels.service }}</p>
-          <p><strong>Summary:</strong> {{ .Annotations.summary }}</p>
-          <p><strong>Description:</strong> {{ .Annotations.description }}</p>
-          <p><strong>Started:</strong> {{ .StartsAt }}</p>
-          {{ if .EndsAt }}<p><strong>Ended:</strong> {{ .EndsAt }}</p>{{ end }}
-          {{ end }}
-
-    # Critical alerts - multiple channels
-    - name: 'critical-alerts'
-      email_configs:
-      - to: 'critical-alerts@yourdomain.com,oncall@yourdomain.com'
-        headers:
-          Subject: '🚨 [CRITICAL] {{ .GroupLabels.alertname }} - {{ .GroupLabels.service }}'
-        send_resolved: true
-      # Uncomment to enable Slack notifications
-      # slack_configs:
-      # - api_url: '{{ .slack_webhook_url }}'
-      #   channel: '#alerts-critical'
-      #   title: '🚨 Critical Alert'
-      #   text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
-      #   send_resolved: true
-
-    # Warning alerts
-    - name: 'warning-alerts'
-      email_configs:
-      - to: 'alerts@yourdomain.com'
-        headers:
-          Subject: '⚠️ [WARNING] {{ .GroupLabels.alertname }} - {{ .GroupLabels.service }}'
-        send_resolved: true
-
-    # Alert system team
-    - name: 'alert-system-team'
-      email_configs:
-      - to: 'alert-system-team@yourdomain.com'
-        headers:
-          Subject: '[Alert System] {{ .GroupLabels.alertname }}'
-        send_resolved: true
-
-    # Database team
-    - name: 'database-team'
-      email_configs:
-      - to: 'database-team@yourdomain.com'
-        headers:
-          Subject: '[Database] {{ .GroupLabels.alertname }}'
-        send_resolved: true
-
-    # Infrastructure team
-    - name: 'infra-team'
-      email_configs:
-      - to: 'infra-team@yourdomain.com'
-        headers:
-          Subject: '[Infrastructure] {{ .GroupLabels.alertname }}'
-        send_resolved: true
-
---
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: alertmanager-templates
-  namespace: monitoring
-data:
-  default.tmpl: |
-    {{ define "cluster" }}{{ .ExternalURL | reReplaceAll ".*alertmanager\\.(.*)" "$1" }}{{ end }}
-
-    {{ define "slack.default.title" }}
-    [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.alertname }}
-    {{ end }}
-
-    {{ define "slack.default.text" }}
-    {{ range .Alerts }}
-    *Alert:* {{ .Annotations.summary }}
-    *Description:* {{ .Annotations.description }}
-    *Severity:* `{{ .Labels.severity }}`
-    *Service:* `{{ .Labels.service }}`
-    {{ end }}
-    {{ end }}
-
---
-apiVersion: apps/v1
-kind: StatefulSet
-metadata:
-  name: alertmanager
-  namespace: monitoring
-  labels:
-    app: alertmanager
-spec:
-  serviceName: alertmanager
-  replicas: 3
-  selector:
-    matchLabels:
-      app: alertmanager
-  template:
-    metadata:
-      labels:
-        app: alertmanager
-    spec:
-      serviceAccountName: prometheus
-      initContainers:
-      - name: init-config
-        image: busybox:1.36
-        command: ['/bin/sh', '/scripts/init-config.sh']
-        env:
-        - name: SMTP_HOST
-          valueFrom:
-            secretKeyRef:
-              name: alertmanager-secrets
-              key: smtp-host
-        - name: SMTP_USERNAME
-          valueFrom:
-            secretKeyRef:
-              name: alertmanager-secrets
-              key: smtp-username
-        - name: SMTP_PASSWORD
-          valueFrom:
-            secretKeyRef:
-              name: alertmanager-secrets
-              key: smtp-password
-        - name: SMTP_FROM
-          valueFrom:
-            secretKeyRef:
-              name: alertmanager-secrets
-              key: smtp-from
-        - name: SLACK_WEBHOOK_URL
-          valueFrom:
-            secretKeyRef:
-              name: alertmanager-secrets
-              key: slack-webhook-url
-              optional: true
-        volumeMounts:
-        - name: init-script
-          mountPath: /scripts
-        - name: config-template
-          mountPath: /etc/alertmanager-template
-        - name: config-final
-          mountPath: /etc/alertmanager-final
-      affinity:
-        podAntiAffinity:
-          preferredDuringSchedulingIgnoredDuringExecution:
-          - weight: 100
-            podAffinityTerm:
-              labelSelector:
-                matchExpressions:
-                - key: app
-                  operator: In
-                  values:
-                  - alertmanager
-              topologyKey: kubernetes.io/hostname
-      containers:
-      - name: alertmanager
-        image: prom/alertmanager:v0.27.0
-        args:
-        - '--config.file=/etc/alertmanager/alertmanager.yml'
-        - '--storage.path=/alertmanager'
-        - '--cluster.listen-address=0.0.0.0:9094'
-        - '--cluster.peer=alertmanager-0.alertmanager.monitoring.svc.cluster.local:9094'
-        - '--cluster.peer=alertmanager-1.alertmanager.monitoring.svc.cluster.local:9094'
-        - '--cluster.peer=alertmanager-2.alertmanager.monitoring.svc.cluster.local:9094'
-        - '--cluster.reconnect-timeout=5m'
-        - '--web.external-url=http://monitoring.bakery-ia.local/alertmanager'
-        - '--web.route-prefix=/'
-        ports:
-        - name: web
-          containerPort: 9093
-        - name: mesh-tcp
-          containerPort: 9094
-        - name: mesh-udp
-          containerPort: 9094
-          protocol: UDP
-        env:
-        - name: POD_NAME
-          valueFrom:
-            fieldRef:
-              fieldPath: metadata.name
-        volumeMounts:
-        - name: config-final
-          mountPath: /etc/alertmanager
-        - name: templates
-          mountPath: /etc/alertmanager/templates
-        - name: storage
-          mountPath: /alertmanager
-        resources:
-          requests:
-            memory: "128Mi"
-            cpu: "100m"
-          limits:
-            memory: "256Mi"
-            cpu: "500m"
-        livenessProbe:
-          httpGet:
-            path: /-/healthy
-            port: 9093
-          initialDelaySeconds: 30
-          periodSeconds: 10
-        readinessProbe:
-          httpGet:
-            path: /-/ready
-            port: 9093
-          initialDelaySeconds: 5
-          periodSeconds: 5
-
-      # Config reloader sidecar
-      - name: configmap-reload
-        image: jimmidyson/configmap-reload:v0.12.0
-        args:
-        - '--webhook-url=http://localhost:9093/-/reload'
-        - '--volume-dir=/etc/alertmanager'
-        volumeMounts:
-        - name: config-final
-          mountPath: /etc/alertmanager
-          readOnly: true
-        resources:
-          requests:
-            memory: "16Mi"
-            cpu: "10m"
-          limits:
-            memory: "32Mi"
-            cpu: "50m"
-
-      volumes:
-      - name: init-script
-        configMap:
-          name: alertmanager-init-script
-          defaultMode: 0755
-      - name: config-template
-        configMap:
-          name: alertmanager-config
-      - name: config-final
-        emptyDir: {}
-      - name: templates
-        configMap:
-          name: alertmanager-templates
-
-  volumeClaimTemplates:
-  - metadata:
-      name: storage
-    spec:
-      accessModes: [ "ReadWriteOnce" ]
-      resources:
-        requests:
-          storage: 2Gi
-
---
-apiVersion: v1
-kind: Service
-metadata:
-  name: alertmanager
-  namespace: monitoring
-  labels:
-    app: alertmanager
-spec:
-  type: ClusterIP
-  clusterIP: None
-  ports:
-  - name: web
-    port: 9093
-    targetPort: 9093
-  - name: mesh-tcp
-    port: 9094
-    targetPort: 9094
-  - name: mesh-udp
-    port: 9094
-    targetPort: 9094
-    protocol: UDP
-  selector:
-    app: alertmanager
-
---
-apiVersion: v1
-kind: Service
-metadata:
-  name: alertmanager-external
-  namespace: monitoring
-  labels:
-    app: alertmanager
-spec:
-  type: ClusterIP
-  ports:
-  - name: web
-    port: 9093
-    targetPort: 9093
-  selector:
-    app: alertmanager
--- a/infrastructure/kubernetes/base/components/monitoring/grafana-dashboards-extended.yaml
+++ b/infrastructure/kubernetes/base/components/monitoring/grafana-dashboards-extended.yaml
@@ -1,949 +0,0 @@
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: grafana-dashboards-extended
-  namespace: monitoring
-data:
-  postgresql-dashboard.json: |
-    {
-      "dashboard": {
-        "title": "Bakery IA - PostgreSQL Database",
-        "tags": ["bakery-ia", "postgresql", "database"],
-        "timezone": "browser",
-        "refresh": "30s",
-        "schemaVersion": 16,
-        "version": 1,
-        "panels": [
-          {
-            "id": 1,
-            "title": "Active Connections by Database",
-            "type": "graph",
-            "gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
-            "targets": [
-              {
-                "expr": "pg_stat_activity_count{state=\"active\"}",
-                "legendFormat": "{{datname}} - active"
-              },
-              {
-                "expr": "pg_stat_activity_count{state=\"idle\"}",
-                "legendFormat": "{{datname}} - idle"
-              },
-              {
-                "expr": "pg_stat_activity_count{state=\"idle in transaction\"}",
-                "legendFormat": "{{datname}} - idle tx"
-              }
-            ]
-          },
-          {
-            "id": 2,
-            "title": "Total Connections",
-            "type": "stat",
-            "gridPos": {"x": 12, "y": 0, "w": 6, "h": 4},
-            "targets": [
-              {
-                "expr": "sum(pg_stat_activity_count)",
-                "legendFormat": "Total connections"
-              }
-            ]
-          },
-          {
-            "id": 3,
-            "title": "Max Connections",
-            "type": "stat",
-            "gridPos": {"x": 18, "y": 0, "w": 6, "h": 4},
-            "targets": [
-              {
-                "expr": "pg_settings_max_connections",
-                "legendFormat": "Max connections"
-              }
-            ]
-          },
-          {
-            "id": 4,
-            "title": "Transaction Rate (Commits vs Rollbacks)",
-            "type": "graph",
-            "gridPos": {"x": 0, "y": 8, "w": 12, "h": 8},
-            "targets": [
-              {
-                "expr": "rate(pg_stat_database_xact_commit[5m])",
-                "legendFormat": "{{datname}} - commits"
-              },
-              {
-                "expr": "rate(pg_stat_database_xact_rollback[5m])",
-                "legendFormat": "{{datname}} - rollbacks"
-              }
-            ]
-          },
-          {
-            "id": 5,
-            "title": "Cache Hit Ratio",
-            "type": "graph",
-            "gridPos": {"x": 12, "y": 8, "w": 12, "h": 8},
-            "targets": [
-              {
-                "expr": "100 * (1 - (sum(rate(pg_stat_io_blocks_read_total[5m])) / (sum(rate(pg_stat_io_blocks_read_total[5m])) + sum(rate(pg_stat_io_blocks_hit_total[5m])))))",
-                "legendFormat": "Cache hit ratio %"
-              }
-            ]
-          },
-          {
-            "id": 6,
-            "title": "Slow Queries (> 30s)",
-            "type": "table",
-            "gridPos": {"x": 0, "y": 16, "w": 12, "h": 8},
-            "targets": [
-              {
-                "expr": "pg_slow_queries{duration_ms > 30000}",
-                "format": "table",
-                "instant": true
-              }
-            ],
-            "transformations": [
-              {
-                "id": "organize",
-                "options": {
-                  "excludeByName": {},
-                  "indexByName": {},
-                  "renameByName": {
-                    "query": "Query",
-                    "duration_ms": "Duration (ms)",
-                    "datname": "Database"
-                  }
-                }
-              }
-            ]
-          },
-          {
-            "id": 7,
-            "title": "Dead Tuples by Table",
-            "type": "graph",
-            "gridPos": {"x": 12, "y": 16, "w": 12, "h": 8},
-            "targets": [
-              {
-                "expr": "pg_stat_user_tables_n_dead_tup",
-                "legendFormat": "{{schemaname}}.{{relname}}"
-              }
-            ]
-          },
-          {
-            "id": 8,
-            "title": "Table Bloat Estimate",
-            "type": "graph",
-            "gridPos": {"x": 0, "y": 24, "w": 12, "h": 8},
-            "targets": [
-              {
-                "expr": "100 * (pg_stat_user_tables_n_dead_tup * avg_tuple_size) / (pg_total_relation_size * 8192)",
-                "legendFormat": "{{schemaname}}.{{relname}} bloat %"
-              }
-            ]
-          },
-          {
-            "id": 9,
-            "title": "Replication Lag (bytes)",
-            "type": "graph",
-            "gridPos": {"x": 12, "y": 24, "w": 12, "h": 8},
-            "targets": [
-              {
-                "expr": "pg_replication_lag_bytes",
-                "legendFormat": "{{slot_name}} - {{application_name}}"
-              }
-            ]
-          },
-          {
-            "id": 10,
-            "title": "Database Size (GB)",
-            "type": "graph",
-            "gridPos": {"x": 0, "y": 32, "w": 12, "h": 8},
-            "targets": [
-              {
-                "expr": "pg_database_size_bytes / 1024 / 1024 / 1024",
-                "legendFormat": "{{datname}}"
-              }
-            ]
-          },
-          {
-            "id": 11,
-            "title": "Database Size Growth (per hour)",
-            "type": "graph",
-            "gridPos": {"x": 12, "y": 32, "w": 12, "h": 8},
-            "targets": [
-              {
-                "expr": "rate(pg_database_size_bytes[1h])",
-                "legendFormat": "{{datname}} - bytes/hour"
-              }
-            ]
-          },
-          {
-            "id": 12,
-            "title": "Lock Counts by Type",
-            "type": "graph",
-            "gridPos": {"x": 0, "y": 40, "w": 12, "h": 8},
-            "targets": [
-              {
-                "expr": "pg_locks_count",
-                "legendFormat": "{{datname}} - {{locktype}} - {{mode}}"
-              }
-            ]
-          },
-          {
-            "id": 13,
-            "title": "Query Duration (p95)",
-            "type": "graph",
-            "gridPos": {"x": 12, "y": 40, "w": 12, "h": 8},
-            "targets": [
-              {
-                "expr": "histogram_quantile(0.95, rate(pg_query_duration_seconds_bucket[5m]))",
-                "legendFormat": "p95"
-              }
-            ]
-          }
-        ]
-      }
-    }
-
-  node-exporter-dashboard.json: |
-    {
-      "dashboard": {
-        "title": "Bakery IA - Node Exporter Infrastructure",
-        "tags": ["bakery-ia", "node-exporter", "infrastructure"],
-        "timezone": "browser",
-        "refresh": "15s",
-        "schemaVersion": 16,
-        "version": 1,
-        "panels": [
-          {
-            "id": 1,
-            "title": "CPU Usage by Node",
-            "type": "graph",
-            "gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
-            "targets": [
-              {
-                "expr": "100 - (avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
-                "legendFormat": "{{instance}} - {{cpu}}"
-              }
-            ]
-          },
-          {
-            "id": 2,
-            "title": "Average CPU Usage",
-            "type": "stat",
-            "gridPos": {"x": 12, "y": 0, "w": 6, "h": 4},
-            "targets": [
-              {
-                "expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
-                "legendFormat": "Average CPU %"
-              }
-            ]
-          },
-          {
-            "id": 3,
-            "title": "CPU Load (1m, 5m, 15m)",
-            "type": "stat",
-            "gridPos": {"x": 18, "y": 0, "w": 6, "h": 4},
-            "targets": [
-              {
-                "expr": "avg(node_load1)",
-                "legendFormat": "1m"
-              },
-              {
-                "expr": "avg(node_load5)",
-                "legendFormat": "5m"
-              },
-              {
-                "expr": "avg(node_load15)",
-                "legendFormat": "15m"
-              }
-            ]
-          },
-          {
-            "id": 4,
-            "title": "Memory Usage by Node",
-            "type": "graph",
-            "gridPos": {"x": 0, "y": 8, "w": 12, "h": 8},
-            "targets": [
-              {
-                "expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
-                "legendFormat": "{{instance}}"
-              }
-            ]
-          },
-          {
-            "id": 5,
-            "title": "Memory Used (GB)",
-            "type": "stat",
-            "gridPos": {"x": 12, "y": 8, "w": 6, "h": 4},
-            "targets": [
-              {
-                "expr": "(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / 1024 / 1024 / 1024",
-                "legendFormat": "{{instance}}"
-              }
-            ]
-          },
-          {
-            "id": 6,
-            "title": "Memory Available (GB)",
-            "type": "stat",
-            "gridPos": {"x": 18, "y": 8, "w": 6, "h": 4},
-            "targets": [
-              {
-                "expr": "node_memory_MemAvailable_bytes / 1024 / 1024 / 1024",
-                "legendFormat": "{{instance}}"
-              }
-            ]
-          },
-          {
-            "id": 7,
-            "title": "Disk I/O Read Rate (MB/s)",
-            "type": "graph",
-            "gridPos": {"x": 0, "y": 16, "w": 12, "h": 8},
-            "targets": [
-              {
-                "expr": "rate(node_disk_read_bytes_total[5m]) / 1024 / 1024",
-                "legendFormat": "{{instance}} - {{device}}"
-              }
-            ]
-          },
-          {
-            "id": 8,
-            "title": "Disk I/O Write Rate (MB/s)",
-            "type": "graph",
-            "gridPos": {"x": 12, "y": 16, "w": 12, "h": 8},
-            "targets": [
-              {
-                "expr": "rate(node_disk_written_bytes_total[5m]) / 1024 / 1024",
-                "legendFormat": "{{instance}} - {{device}}"
-              }
-            ]
-          },
-          {
-            "id": 9,
-            "title": "Disk I/O Operations (IOPS)",
-            "type": "graph",
-            "gridPos": {"x": 0, "y": 24, "w": 12, "h": 8},
-            "targets": [
-              {
-                "expr": "rate(node_disk_reads_completed_total[5m]) + rate(node_disk_writes_completed_total[5m])",
-                "legendFormat": "{{instance}} - {{device}}"
-              }
-            ]
-          },
-          {
-            "id": 10,
-            "title": "Network Receive Rate (Mbps)",
-            "type": "graph",
-            "gridPos": {"x": 12, "y": 24, "w": 12, "h": 8},
-            "targets": [
-              {
-                "expr": "rate(node_network_receive_bytes_total{device!=\"lo\"}[5m]) * 8 / 1024 / 1024",
-                "legendFormat": "{{instance}} - {{device}}"
-              }
-            ]
-          },
-          {
-            "id": 11,
-            "title": "Network Transmit Rate (Mbps)",
-            "type": "graph",
-            "gridPos": {"x": 0, "y": 32, "w": 12, "h": 8},
-            "targets": [
-              {
-                "expr": "rate(node_network_transmit_bytes_total{device!=\"lo\"}[5m]) * 8 / 1024 / 1024",
-                "legendFormat": "{{instance}} - {{device}}"
-              }
-            ]
-          },
-          {
-            "id": 12,
-            "title": "Network Errors",
-            "type": "graph",
-            "gridPos": {"x": 12, "y": 32, "w": 12, "h": 8},
-            "targets": [
-              {
-                "expr": "rate(node_network_receive_errs_total[5m]) + rate(node_network_transmit_errs_total[5m])",
-                "legendFormat": "{{instance}} - {{device}}"
-              }
-            ]
-          },
-          {
-            "id": 13,
-            "title": "Filesystem Usage by Mount",
-            "type": "graph",
-            "gridPos": {"x": 0, "y": 40, "w": 12, "h": 8},
-            "targets": [
-              {
-                "expr": "100 * (1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes))",
-                "legendFormat": "{{instance}} - {{mountpoint}}"
-              }
-            ]
-          },
-          {
-            "id": 14,
-            "title": "Filesystem Available (GB)",
-            "type": "stat",
-            "gridPos": {"x": 12, "y": 40, "w": 6, "h": 4},
-            "targets": [
-              {
-                "expr": "node_filesystem_avail_bytes / 1024 / 1024 / 1024",
-                "legendFormat": "{{instance}} - {{mountpoint}}"
-              }
-            ]
-          },
-          {
-            "id": 15,
-            "title": "Filesystem Size (GB)",
-            "type": "stat",
-            "gridPos": {"x": 18, "y": 40, "w": 6, "h": 4},
-            "targets": [
-              {
-                "expr": "node_filesystem_size_bytes / 1024 / 1024 / 1024",
-                "legendFormat": "{{instance}} - {{mountpoint}}"
-              }
-            ]
-          },
-          {
-            "id": 16,
-            "title": "Load Average (1m, 5m, 15m)",
-            "type": "graph",
-            "gridPos": {"x": 0, "y": 48, "w": 12, "h": 8},
-            "targets": [
-              {
-                "expr": "node_load1",
-                "legendFormat": "{{instance}} - 1m"
-              },
-              {
-                "expr": "node_load5",
-                "legendFormat": "{{instance}} - 5m"
-              },
-              {
-                "expr": "node_load15",
-                "legendFormat": "{{instance}} - 15m"
-              }
-            ]
-          },
-          {
-            "id": 17,
-            "title": "System Up Time",
-            "type": "stat",
-            "gridPos": {"x": 12, "y": 48, "w": 12, "h": 8},
-            "targets": [
-              {
-                "expr": "node_boot_time_seconds",
-                "legendFormat": "{{instance}} - uptime"
-              }
-            ]
-          },
-          {
-            "id": 18,
-            "title": "Context Switches",
-            "type": "graph",
-            "gridPos": {"x": 0, "y": 56, "w": 12, "h": 8},
-            "targets": [
-              {
-                "expr": "rate(node_context_switches_total[5m])",
-                "legendFormat": "{{instance}}"
-              }
-            ]
-          },
-          {
-            "id": 19,
-            "title": "Interrupts",
-            "type": "graph",
-            "gridPos": {"x": 12, "y": 56, "w": 12, "h": 8},
-            "targets": [
-              {
-                "expr": "rate(node_intr_total[5m])",
-                "legendFormat": "{{instance}}"
-              }
-            ]
-          }
-        ]
-      }
-    }
-
-  alertmanager-dashboard.json: |
-    {
-      "dashboard": {
-        "title": "Bakery IA - AlertManager Monitoring",
-        "tags": ["bakery-ia", "alertmanager", "alerting"],
-        "timezone": "browser",
-        "refresh": "10s",
-        "schemaVersion": 16,
-        "version": 1,
-        "panels": [
-          {
-            "id": 1,
-            "title": "Active Alerts by Severity",
-            "type": "graph",
-            "gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
-            "targets": [
-              {
-                "expr": "count by (severity) (ALERTS{alertstate=\"firing\"})",
-                "legendFormat": "{{severity}}"
-              }
-            ]
-          },
-          {
-            "id": 2,
-            "title": "Total Active Alerts",
-            "type": "stat",
-            "gridPos": {"x": 12, "y": 0, "w": 6, "h": 4},
-            "targets": [
-              {
-                "expr": "count(ALERTS{alertstate=\"firing\"})",
-                "legendFormat": "Active alerts"
-              }
-            ]
-          },
-          {
-            "id": 3,
-            "title": "Critical Alerts",
-            "type": "stat",
-            "gridPos": {"x": 18, "y": 0, "w": 6, "h": 4},
-            "targets": [
-              {
-                "expr": "count(ALERTS{alertstate=\"firing\", severity=\"critical\"})",
-                "legendFormat": "Critical"
-              }
-            ]
-          },
-          {
-            "id": 4,
-            "title": "Alert Firing Rate (per minute)",
-            "type": "graph",
-            "gridPos": {"x": 0, "y": 8, "w": 12, "h": 8},
-            "targets": [
-              {
-                "expr": "rate(alertmanager_alerts_fired_total[1m])",
-                "legendFormat": "Alerts fired/min"
-              }
-            ]
-          },
-          {
-            "id": 5,
-            "title": "Alert Resolution Rate (per minute)",
-            "type": "graph",
-            "gridPos": {"x": 12, "y": 8, "w": 12, "h": 8},
-            "targets": [
-              {
-                "expr": "rate(alertmanager_alerts_resolved_total[1m])",
-                "legendFormat": "Alerts resolved/min"
-              }
-            ]
-          },
-          {
-            "id": 6,
-            "title": "Notification Success Rate",
-            "type": "graph",
-            "gridPos": {"x": 0, "y": 16, "w": 12, "h": 8},
-            "targets": [
-              {
-                "expr": "100 * (rate(alertmanager_notifications_total{status=\"success\"}[5m]) / rate(alertmanager_notifications_total[5m]))",
-                "legendFormat": "Success rate %"
-              }
-            ]
-          },
-          {
-            "id": 7,
-            "title": "Notification Failures",
-            "type": "graph",
-            "gridPos": {"x": 12, "y": 16, "w": 12, "h": 8},
-            "targets": [
-              {
-                "expr": "rate(alertmanager_notifications_total{status=\"failed\"}[5m])",
-                "legendFormat": "{{integration}}"
-              }
-            ]
-          },
-          {
-            "id": 8,
-            "title": "Silenced Alerts",
-            "type": "stat",
-            "gridPos": {"x": 0, "y": 24, "w": 6, "h": 4},
-            "targets": [
-              {
-                "expr": "count(ALERTS{alertstate=\"silenced\"})",
-                "legendFormat": "Silenced"
-              }
-            ]
-          },
-          {
-            "id": 9,
-            "title": "AlertManager Cluster Size",
-            "type": "stat",
-            "gridPos": {"x": 6, "y": 24, "w": 6, "h": 4},
-            "targets": [
-              {
-                "expr": "count(alertmanager_cluster_peers)",
-                "legendFormat": "Cluster peers"
-              }
-            ]
-          },
-          {
-            "id": 10,
-            "title": "AlertManager Peers",
-            "type": "stat",
-            "gridPos": {"x": 12, "y": 24, "w": 6, "h": 4},
-            "targets": [
-              {
-                "expr": "alertmanager_cluster_peers",
-                "legendFormat": "{{instance}}"
-              }
-            ]
-          },
-          {
-            "id": 11,
-            "title": "Cluster Status",
-            "type": "stat",
-            "gridPos": {"x": 18, "y": 24, "w": 6, "h": 4},
-            "targets": [
-              {
-                "expr": "up{job=\"alertmanager\"}",
-                "legendFormat": "{{instance}}"
-              }
-            ]
-          },
-          {
-            "id": 12,
-            "title": "Alerts by Group",
-            "type": "table",
-            "gridPos": {"x": 0, "y": 28, "w": 12, "h": 8},
-            "targets": [
-              {
-                "expr": "count by (alertname) (ALERTS{alertstate=\"firing\"})",
-                "format": "table",
-                "instant": true
-              }
-            ],
-            "transformations": [
-              {
-                "id": "organize",
-                "options": {
-                  "excludeByName": {},
-                  "indexByName": {},
-                  "renameByName": {
-                    "alertname": "Alert Name",
-                    "Value": "Count"
-                  }
-                }
-              }
-            ]
-          },
-          {
-            "id": 13,
-            "title": "Alert Duration (p99)",
-            "type": "graph",
-            "gridPos": {"x": 12, "y": 28, "w": 12, "h": 8},
-            "targets": [
-              {
-                "expr": "histogram_quantile(0.99, rate(alertmanager_alert_duration_seconds_bucket[5m]))",
-                "legendFormat": "p99 duration"
-              }
-            ]
-          },
-          {
-            "id": 14,
-            "title": "Processing Time",
-            "type": "graph",
-            "gridPos": {"x": 0, "y": 36, "w": 12, "h": 8},
-            "targets": [
-              {
-                "expr": "rate(alertmanager_receiver_processing_duration_seconds_sum[5m]) / rate(alertmanager_receiver_processing_duration_seconds_count[5m])",
-                "legendFormat": "{{receiver}}"
-              }
-            ]
-          },
-          {
-            "id": 15,
-            "title": "Memory Usage",
-            "type": "stat",
-            "gridPos": {"x": 12, "y": 36, "w": 12, "h": 8},
-            "targets": [
-              {
-                "expr": "process_resident_memory_bytes{job=\"alertmanager\"} / 1024 / 1024",
-                "legendFormat": "{{instance}} - MB"
-              }
-            ]
-          }
-        ]
-      }
-    }
-
-  business-metrics-dashboard.json: |
-    {
-      "dashboard": {
-        "title": "Bakery IA - Business Metrics & KPIs",
-        "tags": ["bakery-ia", "business-metrics", "kpis"],
-        "timezone": "browser",
-        "refresh": "30s",
-        "schemaVersion": 16,
-        "version": 1,
-        "panels": [
-          {
-            "id": 1,
-            "title": "Requests per Service (Rate)",
-            "type": "graph",
-            "gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
-            "targets": [
-              {
-                "expr": "sum by (service) (rate(http_requests_total[5m]))",
-                "legendFormat": "{{service}}"
-              }
-            ]
-          },
-          {
-            "id": 2,
-            "title": "Total Request Rate",
-            "type": "stat",
-            "gridPos": {"x": 12, "y": 0, "w": 6, "h": 4},
-            "targets": [
-              {
-                "expr": "sum(rate(http_requests_total[5m]))",
-                "legendFormat": "requests/sec"
-              }
-            ]
-          },
-          {
-            "id": 3,
-            "title": "Peak Request Rate (5m)",
-            "type": "stat",
-            "gridPos": {"x": 18, "y": 0, "w": 6, "h": 4},
-            "targets": [
-              {
-                "expr": "max(sum(rate(http_requests_total[5m])))",
-                "legendFormat": "Peak requests/sec"
-              }
-            ]
-          },
-          {
-            "id": 4,
-            "title": "Error Rates by Service",
-            "type": "graph",
-            "gridPos": {"x": 0, "y": 8, "w": 12, "h": 8},
-            "targets": [
-              {
-                "expr": "sum by (service) (rate(http_requests_total{status_code=~\"5..\"}[5m]))",
-                "legendFormat": "{{service}}"
-              }
-            ]
-          },
-          {
-            "id": 5,
-            "title": "Overall Error Rate",
-            "type": "stat",
-            "gridPos": {"x": 12, "y": 8, "w": 6, "h": 4},
-            "targets": [
-              {
-                "expr": "100 * (sum(rate(http_requests_total{status_code=~\"5..\"}[5m])) / sum(rate(http_requests_total[5m])))",
-                "legendFormat": "Error %"
-              }
-            ]
-          },
-          {
-            "id": 6,
-            "title": "4xx Error Rate",
-            "type": "stat",
-            "gridPos": {"x": 18, "y": 8, "w": 6, "h": 4},
-            "targets": [
-              {
-                "expr": "100 * (sum(rate(http_requests_total{status_code=~\"4..\"}[5m])) / sum(rate(http_requests_total[5m])))",
-                "legendFormat": "4xx %"
-              }
-            ]
-          },
-          {
-            "id": 7,
-            "title": "P95 Latency by Service (ms)",
-            "type": "graph",
-            "gridPos": {"x": 0, "y": 16, "w": 12, "h": 8},
-            "targets": [
-              {
-                "expr": "histogram_quantile(0.95, sum by (service, le) (rate(http_request_duration_seconds_bucket[5m]))) * 1000",
-                "legendFormat": "{{service}} p95"
-              }
-            ]
-          },
-          {
-            "id": 8,
-            "title": "P99 Latency by Service (ms)",
-            "type": "graph",
-            "gridPos": {"x": 12, "y": 16, "w": 12, "h": 8},
-            "targets": [
-              {
-                "expr": "histogram_quantile(0.99, sum by (service, le) (rate(http_request_duration_seconds_bucket[5m]))) * 1000",
-                "legendFormat": "{{service}} p99"
-              }
-            ]
-          },
-          {
-            "id": 9,
-            "title": "Average Latency (ms)",
-            "type": "stat",
-            "gridPos": {"x": 0, "y": 24, "w": 6, "h": 4},
-            "targets": [
-              {
-                "expr": "(sum(rate(http_request_duration_seconds_sum[5m])) / sum(rate(http_request_duration_seconds_count[5m]))) * 1000",
-                "legendFormat": "Avg latency ms"
-              }
-            ]
-          },
-          {
-            "id": 10,
-            "title": "Active Tenants",
-            "type": "stat",
-            "gridPos": {"x": 6, "y": 24, "w": 6, "h": 4},
-            "targets": [
-              {
-                "expr": "count(count by (tenant_id) (rate(http_requests_total[5m])))",
-                "legendFormat": "Active tenants"
-              }
-            ]
-          },
-          {
-            "id": 11,
-            "title": "Requests per Tenant",
-            "type": "stat",
-            "gridPos": {"x": 12, "y": 24, "w": 12, "h": 4},
-            "targets": [
-              {
-                "expr": "sum by (tenant_id) (rate(http_requests_total[5m]))",
-                "legendFormat": "Tenant {{tenant_id}}"
-              }
-            ]
-          },
-          {
-            "id": 12,
-            "title": "Alert Generation Rate (per minute)",
-            "type": "graph",
-            "gridPos": {"x": 0, "y": 32, "w": 12, "h": 8},
-            "targets": [
-              {
-                "expr": "rate(ALERTS_FOR_STATE[1m])",
-                "legendFormat": "{{alertname}}"
-              }
-            ]
-          },
-          {
-            "id": 13,
-            "title": "Training Job Success Rate",
-            "type": "stat",
-            "gridPos": {"x": 12, "y": 32, "w": 12, "h": 8},
-            "targets": [
-              {
-                "expr": "100 * (sum(training_job_completed_total{status=\"success\"}) / sum(training_job_completed_total))",
-                "legendFormat": "Success rate %"
-              }
-            ]
-          },
-          {
-            "id": 14,
-            "title": "Training Jobs in Progress",
-            "type": "stat",
-            "gridPos": {"x": 0, "y": 40, "w": 6, "h": 4},
-            "targets": [
-              {
-                "expr": "count(training_job_in_progress)",
-                "legendFormat": "Jobs running"
-              }
-            ]
-          },
-          {
-            "id": 15,
-            "title": "Training Job Completion Time (p95, minutes)",
-            "type": "stat",
-            "gridPos": {"x": 6, "y": 40, "w": 6, "h": 4},
-            "targets": [
-              {
-                "expr": "histogram_quantile(0.95, training_job_duration_seconds) / 60",
-                "legendFormat": "p95 minutes"
-              }
-            ]
-          },
-          {
-            "id": 16,
-            "title": "Failed Training Jobs",
-            "type": "stat",
-            "gridPos": {"x": 12, "y": 40, "w": 6, "h": 4},
-            "targets": [
-              {
-                "expr": "sum(training_job_completed_total{status=\"failed\"})",
-                "legendFormat": "Failed jobs"
-              }
-            ]
-          },
-          {
-            "id": 17,
-            "title": "Total Training Jobs Completed",
-            "type": "stat",
-            "gridPos": {"x": 18, "y": 40, "w": 6, "h": 4},
-            "targets": [
-              {
-                "expr": "sum(training_job_completed_total)",
-                "legendFormat": "Total completed"
-              }
-            ]
-          },
-          {
-            "id": 18,
-            "title": "API Health Status",
-            "type": "table",
-            "gridPos": {"x": 0, "y": 48, "w": 12, "h": 8},
-            "targets": [
-              {
-                "expr": "up{job=\"bakery-services\"}",
-                "format": "table",
-                "instant": true
-              }
-            ],
-            "transformations": [
-              {
-                "id": "organize",
-                "options": {
-                  "excludeByName": {},
-                  "indexByName": {},
-                  "renameByName": {
-                    "service": "Service",
-                    "Value": "Status",
-                    "instance": "Instance"
-                  }
-                }
-              }
-            ]
-          },
-          {
-            "id": 19,
-            "title": "Service Success Rate (%)",
-            "type": "graph",
-            "gridPos": {"x": 12, "y": 48, "w": 12, "h": 8},
-            "targets": [
-              {
-                "expr": "100 * (1 - (sum by (service) (rate(http_requests_total{status_code=~\"5..\"}[5m])) / sum by (service) (rate(http_requests_total[5m]))))",
-                "legendFormat": "{{service}}"
-              }
-            ]
-          },
-          {
-            "id": 20,
-            "title": "Requests Processed Today",
-            "type": "stat",
-            "gridPos": {"x": 0, "y": 56, "w": 12, "h": 4},
-            "targets": [
-              {
-                "expr": "sum(increase(http_requests_total[24h]))",
-                "legendFormat": "Requests (24h)"
-              }
-            ]
-          },
-          {
-            "id": 21,
-            "title": "Distinct Users Today",
-            "type": "stat",
-            "gridPos": {"x": 12, "y": 56, "w": 12, "h": 4},
-            "targets": [
-              {
-                "expr": "count(count by (user_id) (increase(http_requests_total{user_id!=\"\"}[24h])))",
-                "legendFormat": "Users (24h)"
-              }
-            ]
-          }
-        ]
-      }
-    }
--- a/infrastructure/kubernetes/base/components/monitoring/grafana-dashboards.yaml
+++ b/infrastructure/kubernetes/base/components/monitoring/grafana-dashboards.yaml
@@ -1,177 +0,0 @@
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: grafana-dashboards
-  namespace: monitoring
-data:
-  gateway-metrics.json: |
-    {
-      "dashboard": {
-        "title": "Bakery IA - Gateway Metrics",
-        "tags": ["bakery-ia", "gateway"],
-        "timezone": "browser",
-        "panels": [
-          {
-            "id": 1,
-            "title": "Request Rate by Endpoint",
-            "type": "graph",
-            "gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
-            "targets": [{
-              "expr": "rate(http_requests_total{service=\"gateway\"}[5m])",
-              "legendFormat": "{{method}} {{endpoint}}"
-            }]
-          },
-          {
-            "id": 2,
-            "title": "P95 Request Latency",
-            "type": "graph",
-            "gridPos": {"x": 12, "y": 0, "w": 12, "h": 8},
-            "targets": [{
-              "expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{service=\"gateway\"}[5m]))",
-              "legendFormat": "{{endpoint}} p95"
-            }]
-          },
-          {
-            "id": 3,
-            "title": "Error Rate (5xx)",
-            "type": "graph",
-            "gridPos": {"x": 0, "y": 8, "w": 12, "h": 8},
-            "targets": [{
-              "expr": "rate(http_requests_total{service=\"gateway\",status_code=~\"5..\"}[5m])",
-              "legendFormat": "{{endpoint}} errors"
-            }]
-          },
-          {
-            "id": 4,
-            "title": "Active Requests",
-            "type": "stat",
-            "gridPos": {"x": 12, "y": 8, "w": 6, "h": 4},
-            "targets": [{
-              "expr": "sum(rate(http_requests_total{service=\"gateway\"}[1m]))"
-            }]
-          },
-          {
-            "id": 5,
-            "title": "Authentication Success Rate",
-            "type": "stat",
-            "gridPos": {"x": 18, "y": 8, "w": 6, "h": 4},
-            "targets": [{
-              "expr": "rate(gateway_auth_responses_total[5m]) / rate(gateway_auth_requests_total[5m]) * 100"
-            }]
-          }
-        ],
-        "refresh": "10s",
-        "schemaVersion": 16,
-        "version": 1
-      }
-    }
-
-  services-overview.json: |
-    {
-      "dashboard": {
-        "title": "Bakery IA - Services Overview",
-        "tags": ["bakery-ia", "services"],
-        "timezone": "browser",
-        "panels": [
-          {
-            "id": 1,
-            "title": "Request Rate by Service",
-            "type": "graph",
-            "gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
-            "targets": [{
-              "expr": "sum by (service) (rate(http_requests_total[5m]))",
-              "legendFormat": "{{service}}"
-            }]
-          },
-          {
-            "id": 2,
-            "title": "P99 Latency by Service",
-            "type": "graph",
-            "gridPos": {"x": 12, "y": 0, "w": 12, "h": 8},
-            "targets": [{
-              "expr": "histogram_quantile(0.99, sum by (service, le) (rate(http_request_duration_seconds_bucket[5m])))",
-              "legendFormat": "{{service}} p99"
-            }]
-          },
-          {
-            "id": 3,
-            "title": "Error Rate by Service",
-            "type": "graph",
-            "gridPos": {"x": 0, "y": 8, "w": 24, "h": 8},
-            "targets": [{
-              "expr": "sum by (service) (rate(http_requests_total{status_code=~\"5..\"}[5m]))",
-              "legendFormat": "{{service}}"
-            }]
-          },
-          {
-            "id": 4,
-            "title": "Service Health Status",
-            "type": "table",
-            "gridPos": {"x": 0, "y": 16, "w": 24, "h": 8},
-            "targets": [{
-              "expr": "up{job=\"bakery-services\"}",
-              "format": "table",
-              "instant": true
-            }],
-            "transformations": [{
-              "id": "organize",
-              "options": {
-                "excludeByName": {},
-                "indexByName": {},
-                "renameByName": {
-                  "service": "Service Name",
-                  "Value": "Status"
-                }
-              }
-            }]
-          }
-        ],
-        "refresh": "30s",
-        "schemaVersion": 16,
-        "version": 1
-      }
-    }
-
-  circuit-breakers.json: |
-    {
-      "dashboard": {
-        "title": "Bakery IA - Circuit Breakers",
-        "tags": ["bakery-ia", "reliability"],
-        "timezone": "browser",
-        "panels": [
-          {
-            "id": 1,
-            "title": "Circuit Breaker States",
-            "type": "stat",
-            "gridPos": {"x": 0, "y": 0, "w": 24, "h": 4},
-            "targets": [{
-              "expr": "circuit_breaker_state",
-              "legendFormat": "{{service}} - {{state}}"
-            }]
-          },
-          {
-            "id": 2,
-            "title": "Circuit Breaker Trips",
-            "type": "graph",
-            "gridPos": {"x": 0, "y": 4, "w": 12, "h": 8},
-            "targets": [{
-              "expr": "rate(circuit_breaker_opened_total[5m])",
-              "legendFormat": "{{service}}"
-            }]
-          },
-          {
-            "id": 3,
-            "title": "Rejected Requests",
-            "type": "graph",
-            "gridPos": {"x": 12, "y": 4, "w": 12, "h": 8},
-            "targets": [{
-              "expr": "rate(circuit_breaker_rejected_total[5m])",
-              "legendFormat": "{{service}}"
-            }]
-          }
-        ],
-        "refresh": "10s",
-        "schemaVersion": 16,
-        "version": 1
-      }
-    }
--- a/infrastructure/kubernetes/base/components/monitoring/grafana.yaml
+++ b/infrastructure/kubernetes/base/components/monitoring/grafana.yaml
@@ -1,166 +0,0 @@
---
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: grafana-datasources
-  namespace: monitoring
-data:
-  prometheus.yaml: |
-    apiVersion: 1
-    datasources:
-    - name: Prometheus
-      type: prometheus
-      access: proxy
-      url: http://prometheus:9090
-      isDefault: true
-      editable: false
-
---
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: grafana-dashboards-config
-  namespace: monitoring
-data:
-  dashboards.yaml: |
-    apiVersion: 1
-    providers:
-    - name: 'default'
-      orgId: 1
-      folder: 'Bakery IA'
-      type: file
-      disableDeletion: false
-      updateIntervalSeconds: 10
-      allowUiUpdates: true
-      options:
-        path: /var/lib/grafana/dashboards
-    - name: 'extended'
-      orgId: 1
-      folder: 'Bakery IA - Extended'
-      type: file
-      disableDeletion: false
-      updateIntervalSeconds: 10
-      allowUiUpdates: true
-      options:
-        path: /var/lib/grafana/dashboards-extended
-
---
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: grafana
-  namespace: monitoring
-  labels:
-    app: grafana
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: grafana
-  template:
-    metadata:
-      labels:
-        app: grafana
-    spec:
-      containers:
-      - name: grafana
-        image: grafana/grafana:12.3.0
-        ports:
-        - containerPort: 3000
-          name: http
-        env:
-        - name: GF_SECURITY_ADMIN_USER
-          valueFrom:
-            secretKeyRef:
-              name: grafana-admin
-              key: admin-user
-        - name: GF_SECURITY_ADMIN_PASSWORD
-          valueFrom:
-            secretKeyRef:
-              name: grafana-admin
-              key: admin-password
-        - name: GF_SERVER_ROOT_URL
-          value: "http://monitoring.bakery-ia.local/grafana"
-        - name: GF_SERVER_SERVE_FROM_SUB_PATH
-          value: "true"
-        - name: GF_AUTH_ANONYMOUS_ENABLED
-          value: "false"
-        - name: GF_INSTALL_PLUGINS
-          value: ""
-        volumeMounts:
-        - name: grafana-storage
-          mountPath: /var/lib/grafana
-        - name: grafana-datasources
-          mountPath: /etc/grafana/provisioning/datasources
-        - name: grafana-dashboards-config
-          mountPath: /etc/grafana/provisioning/dashboards
-        - name: grafana-dashboards
-          mountPath: /var/lib/grafana/dashboards
-        - name: grafana-dashboards-extended
-          mountPath: /var/lib/grafana/dashboards-extended
-        resources:
-          requests:
-            memory: "256Mi"
-            cpu: "100m"
-          limits:
-            memory: "512Mi"
-            cpu: "500m"
-        livenessProbe:
-          httpGet:
-            path: /api/health
-            port: 3000
-          initialDelaySeconds: 30
-          periodSeconds: 10
-        readinessProbe:
-          httpGet:
-            path: /api/health
-            port: 3000
-          initialDelaySeconds: 5
-          periodSeconds: 5
-      volumes:
-      - name: grafana-storage
-        persistentVolumeClaim:
-          claimName: grafana-storage
-      - name: grafana-datasources
-        configMap:
-          name: grafana-datasources
-      - name: grafana-dashboards-config
-        configMap:
-          name: grafana-dashboards-config
-      - name: grafana-dashboards
-        configMap:
-          name: grafana-dashboards
-      - name: grafana-dashboards-extended
-        configMap:
-          name: grafana-dashboards-extended
-
---
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
-  name: grafana-storage
-  namespace: monitoring
-spec:
-  accessModes:
-    - ReadWriteOnce
-  resources:
-    requests:
-      storage: 5Gi
-
---
-apiVersion: v1
-kind: Service
-metadata:
-  name: grafana
-  namespace: monitoring
-  labels:
-    app: grafana
-spec:
-  type: ClusterIP
-  ports:
-  - port: 3000
-    targetPort: 3000
-    protocol: TCP
-    name: http
-  selector:
-    app: grafana
--- a/infrastructure/kubernetes/base/components/monitoring/ha-policies.yaml
+++ b/infrastructure/kubernetes/base/components/monitoring/ha-policies.yaml
@@ -1,100 +0,0 @@
---
-# PodDisruptionBudgets ensure minimum availability during voluntary disruptions
-# (node drains, rolling updates, etc.)
-
-apiVersion: policy/v1
-kind: PodDisruptionBudget
-metadata:
-  name: prometheus-pdb
-  namespace: monitoring
-spec:
-  minAvailable: 1
-  selector:
-    matchLabels:
-      app: prometheus
-
---
-apiVersion: policy/v1
-kind: PodDisruptionBudget
-metadata:
-  name: alertmanager-pdb
-  namespace: monitoring
-spec:
-  minAvailable: 2
-  selector:
-    matchLabels:
-      app: alertmanager
-
---
-apiVersion: policy/v1
-kind: PodDisruptionBudget
-metadata:
-  name: grafana-pdb
-  namespace: monitoring
-spec:
-  minAvailable: 1
-  selector:
-    matchLabels:
-      app: grafana
-
---
-# ResourceQuota limits total resources in monitoring namespace
-apiVersion: v1
-kind: ResourceQuota
-metadata:
-  name: monitoring-quota
-  namespace: monitoring
-spec:
-  hard:
-    # Compute resources
-    requests.cpu: "10"
-    requests.memory: "16Gi"
-    limits.cpu: "20"
-    limits.memory: "32Gi"
-
-    # Storage
-    persistentvolumeclaims: "10"
-    requests.storage: "100Gi"
-
-    # Object counts
-    pods: "50"
-    services: "20"
-    configmaps: "30"
-    secrets: "20"
-
---
-# LimitRange sets default resource limits for pods in monitoring namespace
-apiVersion: v1
-kind: LimitRange
-metadata:
-  name: monitoring-limits
-  namespace: monitoring
-spec:
-  limits:
-  # Default container limits
-  - max:
-      cpu: "2"
-      memory: "4Gi"
-    min:
-      cpu: "10m"
-      memory: "16Mi"
-    default:
-      cpu: "500m"
-      memory: "512Mi"
-    defaultRequest:
-      cpu: "100m"
-      memory: "128Mi"
-    type: Container
-
-  # Pod limits
-  - max:
-      cpu: "4"
-      memory: "8Gi"
-    type: Pod
-
-  # PVC limits
-  - max:
-      storage: "50Gi"
-    min:
-      storage: "1Gi"
-    type: PersistentVolumeClaim
--- a/infrastructure/kubernetes/base/components/monitoring/ingress.yaml
+++ b/infrastructure/kubernetes/base/components/monitoring/ingress.yaml
@@ -1,42 +0,0 @@
---
-apiVersion: networking.k8s.io/v1
-kind: Ingress
-metadata:
-  name: monitoring-ingress
-  namespace: monitoring
-  annotations:
-    nginx.ingress.kubernetes.io/rewrite-target: /$2
-    nginx.ingress.kubernetes.io/ssl-redirect: "false"
-spec:
-  rules:
-  - host: monitoring.bakery-ia.local
-    http:
-      paths:
-      - path: /grafana(/|$)(.*)
-        pathType: ImplementationSpecific
-        backend:
-          service:
-            name: grafana
-            port:
-              number: 3000
-      - path: /prometheus(/|$)(.*)
-        pathType: ImplementationSpecific
-        backend:
-          service:
-            name: prometheus-external
-            port:
-              number: 9090
-      - path: /jaeger(/|$)(.*)
-        pathType: ImplementationSpecific
-        backend:
-          service:
-            name: jaeger-query
-            port:
-              number: 16686
-      - path: /alertmanager(/|$)(.*)
-        pathType: ImplementationSpecific
-        backend:
-          service:
-            name: alertmanager-external
-            port:
-              number: 9093
--- a/infrastructure/kubernetes/base/components/monitoring/jaeger.yaml
+++ b/infrastructure/kubernetes/base/components/monitoring/jaeger.yaml
@@ -1,190 +0,0 @@
---
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: jaeger
-  namespace: monitoring
-  labels:
-    app: jaeger
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: jaeger
-  template:
-    metadata:
-      labels:
-        app: jaeger
-    spec:
-      containers:
-      - name: jaeger
-        image: jaegertracing/all-in-one:1.51
-        env:
-        - name: COLLECTOR_ZIPKIN_HOST_PORT
-          value: ":9411"
-        - name: COLLECTOR_OTLP_ENABLED
-          value: "true"
-        - name: SPAN_STORAGE_TYPE
-          value: "badger"
-        - name: BADGER_EPHEMERAL
-          value: "false"
-        - name: BADGER_DIRECTORY_VALUE
-          value: "/badger/data"
-        - name: BADGER_DIRECTORY_KEY
-          value: "/badger/key"
-        ports:
-        - containerPort: 5775
-          protocol: UDP
-          name: zipkin-compact
-        - containerPort: 6831
-          protocol: UDP
-          name: jaeger-compact
-        - containerPort: 6832
-          protocol: UDP
-          name: jaeger-binary
-        - containerPort: 5778
-          protocol: TCP
-          name: config-rest
-        - containerPort: 16686
-          protocol: TCP
-          name: query
-        - containerPort: 14250
-          protocol: TCP
-          name: grpc
-        - containerPort: 14268
-          protocol: TCP
-          name: c-tchan-trft
-        - containerPort: 14269
-          protocol: TCP
-          name: admin-http
-        - containerPort: 9411
-          protocol: TCP
-          name: zipkin
-        - containerPort: 4317
-          protocol: TCP
-          name: otlp-grpc
-        - containerPort: 4318
-          protocol: TCP
-          name: otlp-http
-        volumeMounts:
-        - name: jaeger-storage
-          mountPath: /badger
-        resources:
-          requests:
-            memory: "512Mi"
-            cpu: "250m"
-          limits:
-            memory: "1Gi"
-            cpu: "500m"
-        livenessProbe:
-          httpGet:
-            path: /
-            port: 14269
-          initialDelaySeconds: 30
-          periodSeconds: 10
-        readinessProbe:
-          httpGet:
-            path: /
-            port: 14269
-          initialDelaySeconds: 5
-          periodSeconds: 5
-      volumes:
-      - name: jaeger-storage
-        persistentVolumeClaim:
-          claimName: jaeger-storage
-
---
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
-  name: jaeger-storage
-  namespace: monitoring
-spec:
-  accessModes:
-    - ReadWriteOnce
-  resources:
-    requests:
-      storage: 10Gi
-
---
-apiVersion: v1
-kind: Service
-metadata:
-  name: jaeger-query
-  namespace: monitoring
-  labels:
-    app: jaeger
-spec:
-  type: ClusterIP
-  ports:
-  - port: 16686
-    targetPort: 16686
-    protocol: TCP
-    name: query
-  selector:
-    app: jaeger
-
---
-apiVersion: v1
-kind: Service
-metadata:
-  name: jaeger-collector
-  namespace: monitoring
-  labels:
-    app: jaeger
-spec:
-  type: ClusterIP
-  ports:
-  - port: 14268
-    targetPort: 14268
-    protocol: TCP
-    name: c-tchan-trft
-  - port: 14250
-    targetPort: 14250
-    protocol: TCP
-    name: grpc
-  - port: 9411
-    targetPort: 9411
-    protocol: TCP
-    name: zipkin
-  - port: 4317
-    targetPort: 4317
-    protocol: TCP
-    name: otlp-grpc
-  - port: 4318
-    targetPort: 4318
-    protocol: TCP
-    name: otlp-http
-  selector:
-    app: jaeger
-
---
-apiVersion: v1
-kind: Service
-metadata:
-  name: jaeger-agent
-  namespace: monitoring
-  labels:
-    app: jaeger
-spec:
-  type: ClusterIP
-  clusterIP: None
-  ports:
-  - port: 5775
-    targetPort: 5775
-    protocol: UDP
-    name: zipkin-compact
-  - port: 6831
-    targetPort: 6831
-    protocol: UDP
-    name: jaeger-compact
-  - port: 6832
-    targetPort: 6832
-    protocol: UDP
-    name: jaeger-binary
-  - port: 5778
-    targetPort: 5778
-    protocol: TCP
-    name: config-rest
-  selector:
-    app: jaeger
--- a/infrastructure/kubernetes/base/components/monitoring/kustomization.yaml
+++ b/infrastructure/kubernetes/base/components/monitoring/kustomization.yaml
@@ -1,18 +1,20 @@
 apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization

+# Minimal Monitoring Infrastructure
+# SigNoz is now managed via Helm in the 'signoz' namespace
+# This kustomization only maintains:
+# - Namespace for legacy resources (if needed)
+# - Node exporter for infrastructure metrics
+# - PostgreSQL exporter for database metrics
+# - Optional OTEL collector (can be disabled if using SigNoz's built-in collector)
+
 resources:
  - namespace.yaml
  - secrets.yaml
-  - prometheus.yaml
-  - alert-rules.yaml
-  - alertmanager.yaml
-  - alertmanager-init.yaml
-  - grafana.yaml
-  - grafana-dashboards.yaml
-  - grafana-dashboards-extended.yaml
-  - postgres-exporter.yaml
+  # Exporters for metrics collection
  - node-exporter.yaml
-  - jaeger.yaml
-  - ha-policies.yaml
-  - ingress.yaml
+  - postgres-exporter.yaml
+  # Optional: dedicated OTEL collector in the monitoring namespace (otherwise use SigNoz's built-in one).
+  # NOTE: gateway-service.yaml exports to otel-collector.monitoring.svc.cluster.local:4317; uncomment below or repoint it.
+  # - otel-collector.yaml
--- a/infrastructure/kubernetes/base/components/monitoring/otel-collector.yaml
+++ b/infrastructure/kubernetes/base/components/monitoring/otel-collector.yaml
@@ -0,0 +1,167 @@
+---
+apiVersion: v1
+kind: ConfigMap  # OpenTelemetry Collector pipeline config, mounted by the otel-collector Deployment
+metadata:
+  name: otel-collector-config
+  namespace: monitoring
+data:
+  otel-collector-config.yaml: |
+    extensions:
+      health_check:
+        endpoint: 0.0.0.0:13133  # probed by the Deployment's liveness/readiness checks
+
+    receivers:
+      otlp:
+        protocols:
+          grpc:
+            endpoint: 0.0.0.0:4317
+          http:
+            endpoint: 0.0.0.0:4318
+
+    processors:
+      batch:
+        timeout: 10s
+        send_batch_size: 1024
+
+      # Memory limiter to prevent OOM; must stay below the 512Mi container limit to act before the kernel does
+      memory_limiter:
+        check_interval: 1s
+        limit_mib: 400
+        spike_limit_mib: 100
+
+    exporters:
+      # Export metrics to Prometheus
+      prometheus:
+        endpoint: "0.0.0.0:8889"
+        namespace: otelcol
+        const_labels:
+          source: otel-collector
+
+      # Export to SigNoz's OTLP gRPC receiver; the query service (:8080) is an HTTP API, not an OTLP endpoint
+      otlp/signoz:
+        endpoint: "signoz-otel-collector.signoz.svc.cluster.local:4317"  # assumes Helm release "signoz" - TODO confirm
+        tls:
+          insecure: true
+
+      # Logging exporter for debugging traces and logs
+      logging:
+        loglevel: info
+        sampling_initial: 5
+        sampling_thereafter: 200
+
+    service:
+      extensions: [health_check]
+      pipelines:
+        # Traces pipeline: receive -> process -> export to SigNoz
+        traces:
+          receivers: [otlp]
+          processors: [memory_limiter, batch]  # memory_limiter first so data is refused before it is batched
+          exporters: [otlp/signoz, logging]
+
+        # Metrics pipeline: receive -> process -> export to both Prometheus and SigNoz
+        metrics:
+          receivers: [otlp]
+          processors: [memory_limiter, batch]
+          exporters: [prometheus, otlp/signoz]
+
+        # Logs pipeline: receive -> process -> export to SigNoz
+        logs:
+          receivers: [otlp]
+          processors: [memory_limiter, batch]
+          exporters: [otlp/signoz, logging]
+
+---
+apiVersion: apps/v1
+kind: Deployment  # Runs a single OpenTelemetry Collector pod configured from the otel-collector-config ConfigMap
+metadata:
+  name: otel-collector
+  namespace: monitoring
+  labels:
+    app: otel-collector
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: otel-collector
+  template:
+    metadata:
+      labels:
+        app: otel-collector  # matched by spec.selector above and by the otel-collector Service selector
+    spec:
+      containers:
+      - name: otel-collector
+        image: otel/opentelemetry-collector-contrib:0.91.0
+        args:
+        - --config=/conf/otel-collector-config.yaml  # file projected by the otel-collector-config volume below
+        ports:
+        - containerPort: 4317  # OTLP gRPC receiver endpoint in the collector config
+          protocol: TCP
+          name: otlp-grpc
+        - containerPort: 4318  # OTLP HTTP receiver endpoint in the collector config
+          protocol: TCP
+          name: otlp-http
+        - containerPort: 8889  # prometheus exporter endpoint in the collector config
+          protocol: TCP
+          name: prometheus
+        - containerPort: 13133  # health_check extension endpoint in the collector config
+          protocol: TCP
+          name: health-check
+        volumeMounts:
+        - name: otel-collector-config
+          mountPath: /conf
+        resources:
+          requests:
+            memory: "256Mi"
+            cpu: "100m"
+          limits:
+            memory: "512Mi"
+            cpu: "500m"
+        livenessProbe:
+          httpGet:
+            path: /
+            port: 13133  # same port as the health-check containerPort
+          initialDelaySeconds: 30
+          periodSeconds: 10
+        readinessProbe:
+          httpGet:
+            path: /
+            port: 13133  # same port as the health-check containerPort
+          initialDelaySeconds: 5
+          periodSeconds: 5
+      volumes:
+      - name: otel-collector-config
+        configMap:
+          name: otel-collector-config
+          items:
+          - key: otel-collector-config.yaml  # only this key is projected into the volume
+            path: otel-collector-config.yaml
+
+---
+apiVersion: v1
+kind: Service  # ClusterIP front for the otel-collector pods: OTLP gRPC, OTLP HTTP and the Prometheus metrics port
+metadata:
+  name: otel-collector
+  namespace: monitoring
+  labels:
+    app: otel-collector
+  annotations:
+    prometheus.io/scrape: "true"  # NOTE(review): the Prometheus manifest is removed in this change set - confirm a scraper still reads these
+    prometheus.io/port: "8889"
+    prometheus.io/path: "/metrics"
+spec:
+  type: ClusterIP
+  ports:
+  - port: 4317  # OTLP gRPC
+    targetPort: 4317
+    protocol: TCP
+    name: otlp-grpc
+  - port: 4318  # OTLP HTTP
+    targetPort: 4318
+    protocol: TCP
+    name: otlp-http
+  - port: 8889  # metrics exposed by the collector's prometheus exporter
+    targetPort: 8889
+    protocol: TCP
+    name: prometheus
+  selector:
+    app: otel-collector  # selects pods labelled by the otel-collector Deployment template
--- a/infrastructure/kubernetes/base/components/monitoring/prometheus.yaml
+++ b/infrastructure/kubernetes/base/components/monitoring/prometheus.yaml
@@ -1,278 +0,0 @@
---
-apiVersion: v1
-kind: ServiceAccount
-metadata:
-  name: prometheus
-  namespace: monitoring
-
---
-apiVersion: rbac.authorization.k8s.io/v1
-kind: ClusterRole
-metadata:
-  name: prometheus
-rules:
- apiGroups: [""]
-  resources:
-  - nodes
-  - nodes/proxy
-  - services
-  - endpoints
-  - pods
-  verbs: ["get", "list", "watch"]
- apiGroups:
-  - extensions
-  resources:
-  - ingresses
-  verbs: ["get", "list", "watch"]
- nonResourceURLs: ["/metrics"]
-  verbs: ["get"]
-
---
-apiVersion: rbac.authorization.k8s.io/v1
-kind: ClusterRoleBinding
-metadata:
-  name: prometheus
-roleRef:
-  apiGroup: rbac.authorization.k8s.io
-  kind: ClusterRole
-  name: prometheus
-subjects:
- kind: ServiceAccount
-  name: prometheus
-  namespace: monitoring
-
---
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: prometheus-config
-  namespace: monitoring
-data:
-  prometheus.yml: |
-    global:
-      scrape_interval: 30s
-      evaluation_interval: 30s
-      external_labels:
-        cluster: 'bakery-ia'
-        environment: 'production'
-
-    # AlertManager configuration
-    alerting:
-      alertmanagers:
-      - static_configs:
-        - targets:
-          - alertmanager-0.alertmanager.monitoring.svc.cluster.local:9093
-          - alertmanager-1.alertmanager.monitoring.svc.cluster.local:9093
-          - alertmanager-2.alertmanager.monitoring.svc.cluster.local:9093
-
-    # Load alert rules
-    rule_files:
-      - '/etc/prometheus/rules/*.yml'
-
-    scrape_configs:
-      # Scrape Prometheus itself
-      - job_name: 'prometheus'
-        static_configs:
-          - targets: ['localhost:9090']
-
-      # Scrape all bakery-ia services
-      - job_name: 'bakery-services'
-        kubernetes_sd_configs:
-          - role: pod
-            namespaces:
-              names:
-                - bakery-ia
-        relabel_configs:
-          # Only scrape pods with metrics port
-          - source_labels: [__meta_kubernetes_pod_container_port_name]
-            action: keep
-            regex: http
-
-          # Add service name label
-          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
-            target_label: service
-
-          # Add component label
-          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component]
-            target_label: component
-
-          # Add pod name
-          - source_labels: [__meta_kubernetes_pod_name]
-            target_label: pod
-
-          # Set metrics path
-          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
-            action: replace
-            target_label: __metrics_path__
-            regex: (.+)
-
-          # Set scrape port
-          - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
-            action: replace
-            regex: ([^:]+)(?::\d+)?;(\d+)
-            replacement: $1:$2
-            target_label: __address__
-
-      # Scrape Kubernetes nodes
-      - job_name: 'kubernetes-nodes'
-        kubernetes_sd_configs:
-          - role: node
-        relabel_configs:
-          - action: labelmap
-            regex: __meta_kubernetes_node_label_(.+)
-          - target_label: __address__
-            replacement: kubernetes.default.svc:443
-          - source_labels: [__meta_kubernetes_node_name]
-            regex: (.+)
-            target_label: __metrics_path__
-            replacement: /api/v1/nodes/${1}/proxy/metrics
-
-      # Scrape AlertManager
-      - job_name: 'alertmanager'
-        static_configs:
-          - targets:
-            - alertmanager-0.alertmanager.monitoring.svc.cluster.local:9093
-            - alertmanager-1.alertmanager.monitoring.svc.cluster.local:9093
-            - alertmanager-2.alertmanager.monitoring.svc.cluster.local:9093
-
-      # Scrape PostgreSQL exporter
-      - job_name: 'postgres-exporter'
-        static_configs:
-          - targets: ['postgres-exporter.monitoring.svc.cluster.local:9187']
-
-      # Scrape Node Exporter
-      - job_name: 'node-exporter'
-        kubernetes_sd_configs:
-          - role: node
-        relabel_configs:
-          - source_labels: [__address__]
-            regex: '(.*):10250'
-            replacement: '${1}:9100'
-            target_label: __address__
-          - source_labels: [__meta_kubernetes_node_name]
-            target_label: node
-
---
-apiVersion: apps/v1
-kind: StatefulSet
-metadata:
-  name: prometheus
-  namespace: monitoring
-  labels:
-    app: prometheus
-spec:
-  serviceName: prometheus
-  replicas: 2
-  selector:
-    matchLabels:
-      app: prometheus
-  template:
-    metadata:
-      labels:
-        app: prometheus
-    spec:
-      serviceAccountName: prometheus
-      affinity:
-        podAntiAffinity:
-          preferredDuringSchedulingIgnoredDuringExecution:
-          - weight: 100
-            podAffinityTerm:
-              labelSelector:
-                matchExpressions:
-                - key: app
-                  operator: In
-                  values:
-                  - prometheus
-              topologyKey: kubernetes.io/hostname
-      containers:
-      - name: prometheus
-        image: prom/prometheus:v3.0.1
-        args:
-          - '--config.file=/etc/prometheus/prometheus.yml'
-          - '--storage.tsdb.path=/prometheus'
-          - '--storage.tsdb.retention.time=30d'
-          - '--web.console.libraries=/usr/share/prometheus/console_libraries'
-          - '--web.console.templates=/usr/share/prometheus/consoles'
-          - '--web.enable-lifecycle'
-        ports:
-        - containerPort: 9090
-          name: web
-        volumeMounts:
-        - name: prometheus-config
-          mountPath: /etc/prometheus
-        - name: prometheus-rules
-          mountPath: /etc/prometheus/rules
-        - name: prometheus-storage
-          mountPath: /prometheus
-        resources:
-          requests:
-            memory: "1Gi"
-            cpu: "500m"
-          limits:
-            memory: "2Gi"
-            cpu: "1"
-        livenessProbe:
-          httpGet:
-            path: /-/healthy
-            port: 9090
-          initialDelaySeconds: 30
-          periodSeconds: 10
-        readinessProbe:
-          httpGet:
-            path: /-/ready
-            port: 9090
-          initialDelaySeconds: 5
-          periodSeconds: 5
-      volumes:
-      - name: prometheus-config
-        configMap:
-          name: prometheus-config
-      - name: prometheus-rules
-        configMap:
-          name: prometheus-alert-rules
-
-  volumeClaimTemplates:
-  - metadata:
-      name: prometheus-storage
-    spec:
-      accessModes: [ "ReadWriteOnce" ]
-      resources:
-        requests:
-          storage: 20Gi
-
---
-apiVersion: v1
-kind: Service
-metadata:
-  name: prometheus
-  namespace: monitoring
-  labels:
-    app: prometheus
-spec:
-  type: ClusterIP
-  clusterIP: None
-  ports:
-  - port: 9090
-    targetPort: 9090
-    protocol: TCP
-    name: web
-  selector:
-    app: prometheus
-
---
-apiVersion: v1
-kind: Service
-metadata:
-  name: prometheus-external
-  namespace: monitoring
-  labels:
-    app: prometheus
-spec:
-  type: ClusterIP
-  ports:
-  - port: 9090
-    targetPort: 9090
-    protocol: TCP
-    name: web
-  selector:
-    app: prometheus