Add signoz

Tiltfile
@@ -36,6 +36,11 @@ Security Features:
 ✅ pgcrypto extension for encryption
 ✅ PostgreSQL audit logging

+Monitoring:
+📊 Service metrics available at /metrics endpoints
+🔍 Telemetry ready (traces, metrics, logs)
+ℹ️ SigNoz deployment optional for local dev (see signoz-info resource)
+
 Applying security configurations...
 """)

@@ -303,82 +308,131 @@ k8s_resource('redis', resource_deps=['security-setup'], labels=['01-infrastructure'])
 k8s_resource('rabbitmq', labels=['01-infrastructure'])
 k8s_resource('nominatim', labels=['01-infrastructure'])

+# =============================================================================
+# MONITORING RESOURCES - SigNoz (Unified Observability)
+# =============================================================================
+
+# Note: SigNoz Helm chart is complex for local dev
+# For development, access SigNoz manually or use production Helm deployment
+# To deploy SigNoz manually: ./infrastructure/helm/deploy-signoz.sh dev
+local_resource(
+    'signoz-info',
+    cmd='''
+echo "📊 SigNoz Monitoring Information"
+echo ""
+echo "SigNoz Helm deployment is disabled for local development due to complexity."
+echo ""
+echo "Options:"
+echo "1. Deploy manually: ./infrastructure/helm/deploy-signoz.sh dev"
+echo "2. Use production deployment: ./infrastructure/helm/deploy-signoz.sh prod"
+echo "3. Skip monitoring for local development (use application metrics only)"
+echo ""
+echo "For simpler local monitoring, consider using just Prometheus+Grafana"
+echo "or access metrics directly from services at /metrics endpoints."
+''',
+    labels=['05-monitoring'],
+    auto_init=False,
+    trigger_mode=TRIGGER_MODE_MANUAL
+)
+
+# SigNoz ingress (only if manually deployed)
+# Uncomment and trigger manually if you deploy SigNoz
+# local_resource(
+#     'signoz-ingress',
+#     cmd='''
+#     echo "🌐 Applying SigNoz ingress..."
+#     kubectl apply -f infrastructure/kubernetes/overlays/dev/signoz-ingress.yaml
+#     echo "✅ SigNoz ingress configured"
+#     ''',
+#     labels=['05-monitoring'],
+#     auto_init=False,
+#     trigger_mode=TRIGGER_MODE_MANUAL
+# )
+
+# Note: SigNoz components are managed by Helm and deployed outside of kustomize.
+# They will appear automatically once deployed, but we don't track them explicitly in Tilt
+# to avoid startup errors. View them with: kubectl get pods -n signoz
+
+# Optional exporters (in monitoring namespace)
+k8s_resource('node-exporter', labels=['05-monitoring'])
+k8s_resource('postgres-exporter', resource_deps=['auth-db'], labels=['05-monitoring'])

 # =============================================================================
 # DATABASE RESOURCES
 # =============================================================================

 # Core Service Databases
-k8s_resource('auth-db', resource_deps=['security-setup'], labels=['02-databases'])
-k8s_resource('tenant-db', resource_deps=['security-setup'], labels=['02-databases'])
+k8s_resource('auth-db', resource_deps=['security-setup'], labels=['06-databases'])
+k8s_resource('tenant-db', resource_deps=['security-setup'], labels=['06-databases'])

 # Data & Analytics Databases
-k8s_resource('training-db', resource_deps=['security-setup'], labels=['02-databases'])
-k8s_resource('forecasting-db', resource_deps=['security-setup'], labels=['02-databases'])
-k8s_resource('ai-insights-db', resource_deps=['security-setup'], labels=['02-databases'])
+k8s_resource('training-db', resource_deps=['security-setup'], labels=['06-databases'])
+k8s_resource('forecasting-db', resource_deps=['security-setup'], labels=['06-databases'])
+k8s_resource('ai-insights-db', resource_deps=['security-setup'], labels=['06-databases'])

 # Operations Databases
-k8s_resource('sales-db', resource_deps=['security-setup'], labels=['02-databases'])
-k8s_resource('inventory-db', resource_deps=['security-setup'], labels=['02-databases'])
-k8s_resource('production-db', resource_deps=['security-setup'], labels=['02-databases'])
-k8s_resource('procurement-db', resource_deps=['security-setup'], labels=['02-databases'])
-k8s_resource('distribution-db', resource_deps=['security-setup'], labels=['02-databases'])
+k8s_resource('sales-db', resource_deps=['security-setup'], labels=['06-databases'])
+k8s_resource('inventory-db', resource_deps=['security-setup'], labels=['06-databases'])
+k8s_resource('production-db', resource_deps=['security-setup'], labels=['06-databases'])
+k8s_resource('procurement-db', resource_deps=['security-setup'], labels=['06-databases'])
+k8s_resource('distribution-db', resource_deps=['security-setup'], labels=['06-databases'])

 # Supporting Service Databases
-k8s_resource('recipes-db', resource_deps=['security-setup'], labels=['02-databases'])
-k8s_resource('suppliers-db', resource_deps=['security-setup'], labels=['02-databases'])
-k8s_resource('pos-db', resource_deps=['security-setup'], labels=['02-databases'])
-k8s_resource('orders-db', resource_deps=['security-setup'], labels=['02-databases'])
-k8s_resource('external-db', resource_deps=['security-setup'], labels=['02-databases'])
+k8s_resource('recipes-db', resource_deps=['security-setup'], labels=['06-databases'])
+k8s_resource('suppliers-db', resource_deps=['security-setup'], labels=['06-databases'])
+k8s_resource('pos-db', resource_deps=['security-setup'], labels=['06-databases'])
+k8s_resource('orders-db', resource_deps=['security-setup'], labels=['06-databases'])
+k8s_resource('external-db', resource_deps=['security-setup'], labels=['06-databases'])

 # Platform Service Databases
-k8s_resource('notification-db', resource_deps=['security-setup'], labels=['02-databases'])
-k8s_resource('alert-processor-db', resource_deps=['security-setup'], labels=['02-databases'])
-k8s_resource('orchestrator-db', resource_deps=['security-setup'], labels=['02-databases'])
+k8s_resource('notification-db', resource_deps=['security-setup'], labels=['06-databases'])
+k8s_resource('alert-processor-db', resource_deps=['security-setup'], labels=['06-databases'])
+k8s_resource('orchestrator-db', resource_deps=['security-setup'], labels=['06-databases'])

 # Demo Service Databases
-k8s_resource('demo-session-db', resource_deps=['security-setup'], labels=['02-databases'])
+k8s_resource('demo-session-db', resource_deps=['security-setup'], labels=['06-databases'])

 # =============================================================================
 # MIGRATION JOBS
 # =============================================================================

 # Core Service Migrations
-k8s_resource('auth-migration', resource_deps=['auth-db'], labels=['03-migrations'])
-k8s_resource('tenant-migration', resource_deps=['tenant-db'], labels=['03-migrations'])
+k8s_resource('auth-migration', resource_deps=['auth-db'], labels=['07-migrations'])
+k8s_resource('tenant-migration', resource_deps=['tenant-db'], labels=['07-migrations'])

 # Data & Analytics Migrations
-k8s_resource('training-migration', resource_deps=['training-db'], labels=['03-migrations'])
-k8s_resource('forecasting-migration', resource_deps=['forecasting-db'], labels=['03-migrations'])
-k8s_resource('ai-insights-migration', resource_deps=['ai-insights-db'], labels=['03-migrations'])
+k8s_resource('training-migration', resource_deps=['training-db'], labels=['07-migrations'])
+k8s_resource('forecasting-migration', resource_deps=['forecasting-db'], labels=['07-migrations'])
+k8s_resource('ai-insights-migration', resource_deps=['ai-insights-db'], labels=['07-migrations'])

 # Operations Migrations
-k8s_resource('sales-migration', resource_deps=['sales-db'], labels=['03-migrations'])
-k8s_resource('inventory-migration', resource_deps=['inventory-db'], labels=['03-migrations'])
-k8s_resource('production-migration', resource_deps=['production-db'], labels=['03-migrations'])
-k8s_resource('procurement-migration', resource_deps=['procurement-db'], labels=['03-migrations'])
-k8s_resource('distribution-migration', resource_deps=['distribution-db'], labels=['03-migrations'])
+k8s_resource('sales-migration', resource_deps=['sales-db'], labels=['07-migrations'])
+k8s_resource('inventory-migration', resource_deps=['inventory-db'], labels=['07-migrations'])
+k8s_resource('production-migration', resource_deps=['production-db'], labels=['07-migrations'])
+k8s_resource('procurement-migration', resource_deps=['procurement-db'], labels=['07-migrations'])
+k8s_resource('distribution-migration', resource_deps=['distribution-db'], labels=['07-migrations'])

 # Supporting Service Migrations
-k8s_resource('recipes-migration', resource_deps=['recipes-db'], labels=['03-migrations'])
-k8s_resource('suppliers-migration', resource_deps=['suppliers-db'], labels=['03-migrations'])
-k8s_resource('pos-migration', resource_deps=['pos-db'], labels=['03-migrations'])
-k8s_resource('orders-migration', resource_deps=['orders-db'], labels=['03-migrations'])
-k8s_resource('external-migration', resource_deps=['external-db'], labels=['03-migrations'])
+k8s_resource('recipes-migration', resource_deps=['recipes-db'], labels=['07-migrations'])
+k8s_resource('suppliers-migration', resource_deps=['suppliers-db'], labels=['07-migrations'])
+k8s_resource('pos-migration', resource_deps=['pos-db'], labels=['07-migrations'])
+k8s_resource('orders-migration', resource_deps=['orders-db'], labels=['07-migrations'])
+k8s_resource('external-migration', resource_deps=['external-db'], labels=['07-migrations'])

 # Platform Service Migrations
-k8s_resource('notification-migration', resource_deps=['notification-db'], labels=['03-migrations'])
-k8s_resource('alert-processor-migration', resource_deps=['alert-processor-db'], labels=['03-migrations'])
-k8s_resource('orchestrator-migration', resource_deps=['orchestrator-db'], labels=['03-migrations'])
+k8s_resource('notification-migration', resource_deps=['notification-db'], labels=['07-migrations'])
+k8s_resource('alert-processor-migration', resource_deps=['alert-processor-db'], labels=['07-migrations'])
+k8s_resource('orchestrator-migration', resource_deps=['orchestrator-db'], labels=['07-migrations'])

 # Demo Service Migrations
-k8s_resource('demo-session-migration', resource_deps=['demo-session-db'], labels=['03-migrations'])
+k8s_resource('demo-session-migration', resource_deps=['demo-session-db'], labels=['07-migrations'])

 # =============================================================================
 # DATA INITIALIZATION JOBS
 # =============================================================================

-k8s_resource('external-data-init', resource_deps=['external-migration', 'redis'], labels=['04-data-init'])
-k8s_resource('nominatim-init', labels=['04-data-init'])
+k8s_resource('external-data-init', resource_deps=['external-migration', 'redis'], labels=['08-data-init'])
+k8s_resource('nominatim-init', labels=['08-data-init'])

 # =============================================================================
 # =============================================================================
@@ -517,8 +571,16 @@ Internal Schedulers Active:
 ⏰ Usage Tracking: Daily @ 2:00 AM UTC (tenant-service)

 Access your application:
 Frontend: http://localhost:3000 (or via ingress)
 Gateway: http://localhost:8000 (or via ingress)
+Main Application: https://localhost
+API Endpoints: https://localhost/api/v1/...

+Service Metrics:
+Gateway: http://localhost:8000/metrics
+Any Service: kubectl port-forward <service> 8000:8000
+
+SigNoz (Optional - see SIGNOZ_DEPLOYMENT_RECOMMENDATIONS.md):
+Deploy manually: ./infrastructure/helm/deploy-signoz.sh dev
+Access (if deployed): https://localhost/signoz

 Verify security:
 kubectl get pvc -n bakery-ia

@@ -1,459 +0,0 @@
# 🎉 Production Monitoring MVP - Implementation Complete

**Date:** 2026-01-07
**Status:** ✅ READY FOR PRODUCTION DEPLOYMENT

---

## 📊 What Was Implemented

### **Phase 1: Core Infrastructure** ✅
- ✅ **Prometheus v3.0.1** (2 replicas, HA mode with StatefulSet)
- ✅ **AlertManager v0.27.0** (3 replicas, clustered with gossip protocol)
- ✅ **Grafana v12.3.0** (secure credentials via Kubernetes Secrets)
- ✅ **PostgreSQL Exporter v0.15.0** (database health monitoring)
- ✅ **Node Exporter v1.7.0** (infrastructure monitoring via DaemonSet)
- ✅ **Jaeger v1.51** (distributed tracing with persistent storage)

### **Phase 2: Alert Management** ✅
- ✅ **50+ Alert Rules** across 9 categories:
  - Service health & performance
  - Business logic (ML training, API limits)
  - Alert system health & performance
  - Database & infrastructure alerts
  - Monitoring self-monitoring
- ✅ **Intelligent Alert Routing** by severity, component, and service
- ✅ **Alert Inhibition Rules** to prevent alert storms (see the sketch after this list)
- ✅ **Multi-Channel Notifications** (email + Slack support)
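
For illustration, a minimal AlertManager `inhibit_rules` sketch in the spirit of the rules above; the label names (`severity`, `service`, `instance`) are assumptions about this repo's alert labels, not copied from its config:

```yaml
# Hypothetical excerpt from alertmanager.yml: while a critical alert is firing
# for a service, suppress warning-level alerts on the same service/instance so
# one outage does not fan out into an alert storm.
inhibit_rules:
  - source_matchers:
      - severity = "critical"
    target_matchers:
      - severity = "warning"
    # Only inhibit when source and target describe the same service/instance.
    equal: ["service", "instance"]
```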

### **Phase 3: High Availability** ✅
- ✅ **PodDisruptionBudgets** for all monitoring components (sketched after this list)
- ✅ **Anti-affinity Rules** to spread pods across nodes
- ✅ **ResourceQuota & LimitRange** for namespace resource management
- ✅ **StatefulSets** with volumeClaimTemplates for persistent storage
- ✅ **Headless Services** for StatefulSet DNS discovery
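
To make the HA policies concrete, here is a hedged sketch of a PodDisruptionBudget plus anti-affinity for Prometheus; the `app: prometheus` selector is an assumption, since the real manifests live in ha-policies.yaml and prometheus.yaml:

```yaml
# Hypothetical PDB: keep at least one Prometheus replica available during
# voluntary disruptions such as node drains and rolling upgrades.
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  name: prometheus-pdb
  namespace: monitoring
spec:
  minAvailable: 1
  selector:
    matchLabels:
      app: prometheus  # assumed pod label
---
# Hypothetical anti-affinity (inside the StatefulSet pod template): prefer
# placing the two replicas on different nodes.
affinity:
  podAntiAffinity:
    preferredDuringSchedulingIgnoredDuringExecution:
      - weight: 100
        podAffinityTerm:
          labelSelector:
            matchLabels:
              app: prometheus  # assumed pod label
          topologyKey: kubernetes.io/hostname
```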

### **Phase 4: Observability** ✅
- ✅ **11 Grafana Dashboards** (7 pre-configured + 4 extended):
  1. Gateway Metrics
  2. Services Overview
  3. Circuit Breakers
  4. PostgreSQL Database (13 panels)
  5. Node Exporter Infrastructure (19 panels)
  6. AlertManager Monitoring (15 panels)
  7. Business Metrics & KPIs (21 panels)
  8-11. Plus the existing dashboards
- ✅ **Distributed Tracing** enabled in production
- ✅ **Comprehensive Documentation** with runbooks

---

## 📁 Files Created/Modified

### **New Files:**
```
infrastructure/kubernetes/base/components/monitoring/
├── secrets.yaml                       # Monitoring credentials
├── alertmanager.yaml                  # AlertManager StatefulSet (3 replicas)
├── alertmanager-init.yaml             # Config initialization script
├── alert-rules.yaml                   # 50+ alert rules
├── postgres-exporter.yaml             # PostgreSQL monitoring
├── node-exporter.yaml                 # Infrastructure monitoring (DaemonSet)
├── grafana-dashboards-extended.yaml   # 4 comprehensive dashboards
├── ha-policies.yaml                   # PDBs + ResourceQuota + LimitRange
└── README.md                          # Complete documentation (500+ lines)
```

### **Modified Files:**
```
infrastructure/kubernetes/base/components/monitoring/
├── prometheus.yaml        # Now a StatefulSet with 2 replicas + alert config
├── grafana.yaml           # Uses secrets + mounts the extended dashboards
├── ingress.yaml           # Added /alertmanager path
└── kustomization.yaml     # Added all new resources

infrastructure/kubernetes/overlays/prod/
├── kustomization.yaml     # Enabled monitoring stack
└── prod-configmap.yaml    # JAEGER_ENABLED=true
```

### **Deleted:**
```
infrastructure/monitoring/   # Old legacy config (completely removed)
```

---

## 🚀 Deployment Instructions

### **1. Update Secrets (REQUIRED BEFORE DEPLOYMENT)**

```bash
cd infrastructure/kubernetes/base/components/monitoring

# Generate a strong Grafana password
GRAFANA_PASSWORD=$(openssl rand -base64 32)

# Update secrets.yaml with your actual values:
# - grafana-admin: admin-password
# - alertmanager-secrets: SMTP credentials
# - postgres-exporter: PostgreSQL connection string

# Example for production:
kubectl create secret generic grafana-admin \
  --from-literal=admin-user=admin \
  --from-literal=admin-password="${GRAFANA_PASSWORD}" \
  --namespace monitoring --dry-run=client -o yaml | \
  kubectl apply -f -
```

### **2. Deploy to Production**

```bash
# Apply the monitoring stack
kubectl apply -k infrastructure/kubernetes/overlays/prod

# Verify deployment
kubectl get pods -n monitoring
kubectl get pvc -n monitoring
kubectl get svc -n monitoring
```

### **3. Verify Services**

```bash
# Check Prometheus targets
kubectl port-forward -n monitoring svc/prometheus-external 9090:9090
# Visit: http://localhost:9090/targets

# Check the AlertManager cluster
kubectl port-forward -n monitoring svc/alertmanager-external 9093:9093
# Visit: http://localhost:9093

# Check Grafana dashboards
kubectl port-forward -n monitoring svc/grafana 3000:3000
# Visit: http://localhost:3000 (admin / YOUR_PASSWORD)
```

---

## 📈 What You Get Out of the Box

### **Monitoring Coverage:**
- ✅ **Application Metrics:** Request rates, latencies (P95/P99), error rates per service
- ✅ **Database Health:** Connections, transactions, cache hit ratio, slow queries, locks
- ✅ **Infrastructure:** CPU, memory, disk I/O, network traffic per node
- ✅ **Business KPIs:** Active tenants, training jobs, alert volumes, API health
- ✅ **Distributed Traces:** Full request path tracking across microservices

### **Alerting Capabilities:**
- ✅ **Service Down Detection:** 2-minute threshold with immediate notifications (see the rule sketch after this list)
- ✅ **Performance Degradation:** High latency, error rate, and memory alerts
- ✅ **Resource Exhaustion:** Database connections, disk space, memory limits
- ✅ **Business Logic:** Training job failures, low ML accuracy, rate limits
- ✅ **Alert System Health:** Component failures, delivery issues, capacity problems
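
As a concrete illustration of the 2-minute service-down detection, a hedged Prometheus rule sketch; the group name and labels are assumptions, and the 50+ real rules live in alert-rules.yaml:

```yaml
# Hypothetical rule: fire when a scrape target has been unreachable for 2 minutes.
groups:
  - name: service-health  # assumed group name
    rules:
      - alert: ServiceDown
        expr: up == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "{{ $labels.job }} target {{ $labels.instance }} is down"
```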

### **High Availability:**
- ✅ **Prometheus:** 2 independent instances, can lose 1 without data loss
- ✅ **AlertManager:** 3-node cluster, requires 2/3 for alerts to fire
- ✅ **Monitoring Resilience:** PodDisruptionBudgets ensure service during updates

---

## 🔧 Configuration Highlights

### **Alert Routing (Configured in AlertManager):**

| Severity | Route | Repeat Interval |
|----------|-------|-----------------|
| Critical | critical-alerts@yourdomain.com + oncall@ | 4 hours |
| Warning | alerts@yourdomain.com | 12 hours |
| Info | alerts@yourdomain.com | 24 hours |

**Special Routes:**
- Alert system → alert-system-team@yourdomain.com
- Database alerts → database-team@yourdomain.com
- Infrastructure → infra-team@yourdomain.com
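
A hedged sketch of how this routing table might look in alertmanager.yml; the receiver names are illustrative assumptions, not copied from the repo's config:

```yaml
# Hypothetical route tree mirroring the table above: severity selects the
# receiver, and repeat_interval controls how often an unresolved alert re-notifies.
route:
  receiver: default-email            # assumed receiver (alerts@yourdomain.com)
  routes:
    - matchers: [severity = "critical"]
      receiver: critical-email       # critical-alerts@ + oncall@
      repeat_interval: 4h
    - matchers: [severity = "warning"]
      receiver: default-email
      repeat_interval: 12h
    - matchers: [severity = "info"]
      receiver: default-email
      repeat_interval: 24h
```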

### **Resource Allocation:**

| Component | Replicas | CPU Request | Memory Request | Storage |
|-----------|----------|-------------|----------------|---------|
| Prometheus | 2 | 500m | 1Gi | 20Gi × 2 |
| AlertManager | 3 | 100m | 128Mi | 2Gi × 3 |
| Grafana | 1 | 100m | 256Mi | 5Gi |
| Postgres Exporter | 1 | 50m | 64Mi | - |
| Node Exporter | 1/node | 50m | 64Mi | - |
| Jaeger | 1 | 250m | 512Mi | 10Gi |

**Total Resources:**
- CPU Requests: ~2.5 cores
- Memory Requests: ~4Gi
- Storage: ~70Gi

### **Data Retention:**
- Prometheus: 30 days (see the args sketch below)
- Jaeger: Persistent (BadgerDB)
- Grafana: Persistent dashboards
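
For reference, a sketch of where the 30-day window would be set, assuming the standard Prometheus flags (the real args live in prometheus.yaml):

```yaml
# Hypothetical container spec on the Prometheus StatefulSet:
# --storage.tsdb.retention.time enforces the 30-day retention noted above.
containers:
  - name: prometheus
    image: prom/prometheus:v3.0.1
    args:
      - --config.file=/etc/prometheus/prometheus.yml
      - --storage.tsdb.path=/prometheus
      - --storage.tsdb.retention.time=30d
```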

---

## 🔐 Security Considerations

### **Implemented:**
- ✅ Grafana credentials via Kubernetes Secrets (no hardcoded passwords)
- ✅ SMTP passwords stored in Secrets
- ✅ PostgreSQL connection strings in Secrets
- ✅ Read-only filesystem for Node Exporter
- ✅ Non-root user for Node Exporter (UID 65534)
- ✅ RBAC for Prometheus (ClusterRole with minimal permissions)

### **TODO for Production:**
- ⚠️ Use Sealed Secrets or External Secrets Operator
- ⚠️ Enable TLS for Prometheus remote write (if using it)
- ⚠️ Configure Grafana LDAP/OAuth integration
- ⚠️ Set up proper certificate management for Ingress
- ⚠️ Review and tighten ResourceQuota limits

---

## 📊 Dashboard Access

### **Production URLs (via Ingress):**
```
https://monitoring.yourdomain.com/grafana        # Grafana UI
https://monitoring.yourdomain.com/prometheus     # Prometheus UI
https://monitoring.yourdomain.com/alertmanager   # AlertManager UI
https://monitoring.yourdomain.com/jaeger         # Jaeger UI
```

### **Local Access (Port Forwarding):**
```bash
# Grafana
kubectl port-forward -n monitoring svc/grafana 3000:3000

# Prometheus
kubectl port-forward -n monitoring svc/prometheus-external 9090:9090

# AlertManager
kubectl port-forward -n monitoring svc/alertmanager-external 9093:9093

# Jaeger
kubectl port-forward -n monitoring svc/jaeger-query 16686:16686
```

---

## 🧪 Testing & Validation

### **1. Test Alert Flow:**
```bash
# Fire a test alert (HighMemoryUsage)
kubectl run memory-hog --image=polinux/stress --restart=Never \
  --namespace=bakery-ia -- stress --vm 1 --vm-bytes 600M --timeout 300s

# Check the alert in Prometheus (it should fire within 5 minutes)
# Check that AlertManager received it
# Verify the email notification was sent
```

### **2. Verify Metrics Collection:**
```bash
# Check Prometheus targets (all should be UP)
curl http://localhost:9090/api/v1/targets | jq '.data.activeTargets[] | {job: .labels.job, health: .health}'

# Verify PostgreSQL metrics
curl "http://localhost:9090/api/v1/query?query=pg_up" | jq

# Verify node metrics
curl "http://localhost:9090/api/v1/query?query=node_cpu_seconds_total" | jq
```

### **3. Test Jaeger Tracing:**
```bash
# Make a request through the gateway
curl -H "Authorization: Bearer YOUR_TOKEN" \
  https://api.yourdomain.com/api/v1/health

# Check the trace in the Jaeger UI:
# you should see spans across gateway → auth → tenant services
```

---

## 📖 Documentation

### **Complete Documentation Available:**
- **[README.md](infrastructure/kubernetes/base/components/monitoring/README.md)** - 500+ lines covering:
  - Component overview
  - Deployment instructions
  - Security best practices
  - Accessing services
  - Dashboard descriptions
  - Alert configuration
  - Troubleshooting guide
  - Metrics reference
  - Backup & recovery procedures
  - Maintenance tasks

---

## ⚡ Performance & Scalability

### **Current Capacity:**
- Prometheus can handle ~10M active time series
- AlertManager can process thousands of alerts per second
- Jaeger can handle 10k spans/second
- Grafana supports 1,000+ concurrent users

### **Scaling Recommendations:**
- **> 20M time series:** Deploy Thanos for long-term storage
- **> 5k alerts/min:** Scale AlertManager to 5+ replicas
- **> 50k spans/sec:** Deploy Jaeger with an Elasticsearch/Cassandra backend
- **> 5k Grafana users:** Scale Grafana horizontally with a shared database

---

## 🎯 Success Criteria - ALL MET ✅

- ✅ Prometheus collecting metrics from all services
- ✅ Alert rules evaluating and firing correctly
- ✅ AlertManager routing notifications to the appropriate channels
- ✅ Grafana displaying real-time dashboards
- ✅ Jaeger capturing distributed traces
- ✅ High availability for all critical components
- ✅ Secure credential management
- ✅ Resource limits configured
- ✅ Documentation complete with runbooks
- ✅ No legacy code remaining

---

## 🚨 Important Notes

1. **Update Secrets Before Deployment:**
   - Change all default passwords in `secrets.yaml`
   - Use strong, randomly generated passwords
   - Consider using Sealed Secrets for production

2. **Configure SMTP Settings:**
   - Update the AlertManager SMTP configuration in secrets
   - Test email delivery before relying on alerts

3. **Review Alert Thresholds:**
   - Current thresholds are conservative
   - Adjust based on your SLAs and baseline metrics

4. **Monitor Resource Usage:**
   - Prometheus storage grows over time
   - Plan capacity based on the retention period
   - Consider cleaning up old metrics

5. **Backup Strategy:**
   - PVCs contain critical monitoring data
   - Implement a backup solution for PersistentVolumes (see the snapshot sketch after this list)
   - Test restore procedures regularly
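
A hedged sketch of one way to snapshot the Prometheus data PVC with the CSI snapshot API; the snapshot class and the PVC name (derived from the usual StatefulSet `<template>-<name>-<ordinal>` convention) are assumptions:

```yaml
# Hypothetical VolumeSnapshot: requires a CSI driver with snapshot support
# and an installed VolumeSnapshotClass.
apiVersion: snapshot.storage.k8s.io/v1
kind: VolumeSnapshot
metadata:
  name: prometheus-data-snapshot
  namespace: monitoring
spec:
  volumeSnapshotClassName: csi-snapclass                    # assumed class name
  source:
    persistentVolumeClaimName: prometheus-data-prometheus-0  # assumed PVC name
```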

---

## 🎓 Next Steps (Post-MVP)

### **Short Term (1-2 weeks):**
1. Fine-tune alert thresholds based on production data
2. Add custom business metrics to services
3. Create team-specific dashboards
4. Set up the on-call rotation in AlertManager

### **Medium Term (1-3 months):**
1. Implement SLO tracking and error budgets
2. Deploy Loki for log aggregation
3. Add anomaly detection for metrics
4. Integrate with incident management (PagerDuty/Opsgenie)

### **Long Term (3-6 months):**
1. Deploy Thanos for long-term metrics storage
2. Implement cost tracking and chargeback per tenant
3. Add continuous profiling (Pyroscope)
4. Build ML-based alert prediction

---

## 📞 Support & Troubleshooting

### **Common Issues:**

**Issue:** Prometheus targets showing "DOWN"
```bash
# Check service discovery
kubectl get svc -n bakery-ia
kubectl get endpoints -n bakery-ia
```

**Issue:** AlertManager not sending notifications
```bash
# Check SMTP connectivity
kubectl exec -n monitoring alertmanager-0 -- nc -zv smtp.gmail.com 587

# Check AlertManager logs
kubectl logs -n monitoring alertmanager-0 -f
```

**Issue:** Grafana dashboards showing "No Data"
```bash
# Verify the Prometheus datasource
kubectl port-forward -n monitoring svc/grafana 3000:3000
# Login → Configuration → Data Sources → Test

# Check that Prometheus has data
kubectl port-forward -n monitoring svc/prometheus-external 9090:9090
# Visit /graph and run the query: up
```

### **Getting Help:**
- Check logs: `kubectl logs -n monitoring POD_NAME`
- Check events: `kubectl get events -n monitoring`
- Review the documentation: `infrastructure/kubernetes/base/components/monitoring/README.md`
- Prometheus troubleshooting: https://prometheus.io/docs/prometheus/latest/troubleshooting/
- Grafana troubleshooting: https://grafana.com/docs/grafana/latest/troubleshooting/

---

## ✅ Deployment Checklist

Before going to production, verify:

- [ ] All secrets updated with production values
- [ ] SMTP configuration tested and working
- [ ] Grafana admin password changed from the default
- [ ] PostgreSQL connection string configured
- [ ] Test alert fired and received via email
- [ ] All Prometheus targets are UP
- [ ] Grafana dashboards loading data
- [ ] Jaeger receiving traces
- [ ] Resource quotas appropriate for the cluster size
- [ ] Backup strategy implemented for PVCs
- [ ] Team trained on accessing the monitoring tools
- [ ] Runbooks reviewed and understood
- [ ] On-call rotation configured (if applicable)

---

## 🎉 Summary

**You now have a production-ready monitoring stack with:**

- ✅ **Complete Observability:** Metrics, logs (via stdout), and traces
- ✅ **Intelligent Alerting:** 50+ rules with smart routing and inhibition
- ✅ **Rich Visualization:** 11 dashboards covering all aspects of the system
- ✅ **High Availability:** HA for Prometheus and AlertManager
- ✅ **Security:** Secrets management, RBAC, read-only containers
- ✅ **Documentation:** Comprehensive guides and runbooks
- ✅ **Scalability:** Ready to handle production traffic

**The monitoring MVP is COMPLETE and READY FOR PRODUCTION DEPLOYMENT!** 🚀

---

*Generated: 2026-01-07*
*Version: 1.0.0 - Production MVP*
*Implementation Time: ~3 hours*
@@ -584,23 +584,39 @@ docker push YOUR_VPS_IP:32000/bakery/auth-service

 ### Step 2: Update Production Configuration

-```bash
-# On local machine, edit these files:
+The production configuration is already set up for the **bakewise.ai** domain:
+
+**Production URLs:**
+- **Main Application:** https://bakewise.ai
+- **API Endpoints:** https://bakewise.ai/api/v1/...
+- **Monitoring Dashboard:** https://monitoring.bakewise.ai/grafana
+- **Prometheus:** https://monitoring.bakewise.ai/prometheus
+- **SigNoz (Traces/Metrics/Logs):** https://monitoring.bakewise.ai/signoz
+- **AlertManager:** https://monitoring.bakewise.ai/alertmanager
+
+```bash
+# Verify the configuration is correct:
+cat infrastructure/kubernetes/overlays/prod/prod-ingress.yaml | grep -A 3 "host:"
+
+# Expected output should show:
+# - host: bakewise.ai
+# - host: monitoring.bakewise.ai
+
+# Verify CORS configuration
+cat infrastructure/kubernetes/overlays/prod/prod-configmap.yaml | grep CORS
+
+# Expected: CORS_ORIGINS: "https://bakewise.ai"
+```
+
+**If using a different domain**, update these files:
+```bash
 # 1. Update domain names
 nano infrastructure/kubernetes/overlays/prod/prod-ingress.yaml
-# Replace:
-# - bakery.yourdomain.com → bakery.your-actual-domain.com
-# - api.yourdomain.com → api.your-actual-domain.com
-# - monitoring.yourdomain.com → monitoring.your-actual-domain.com
-# - Update CORS origins
-# - Update cert-manager email
+# Replace bakewise.ai with your domain

 # 2. Update ConfigMap
 nano infrastructure/kubernetes/overlays/prod/prod-configmap.yaml
-# Set:
-# - DOMAIN: "your-actual-domain.com"
-# - CORS_ORIGINS: "https://bakery.your-actual-domain.com,https://www.your-actual-domain.com"
+# Update CORS_ORIGINS

 # 3. Verify image names (if using custom registry)
 nano infrastructure/kubernetes/overlays/prod/kustomization.yaml
@@ -840,22 +856,96 @@ kubectl logs -n bakery-ia deployment/auth-service | grep -i "email\|smtp"

 ## Post-Deployment

-### Step 1: Enable Monitoring
+### Step 1: Access Monitoring Stack

-```bash
-# Monitoring is already configured, verify it's running
-kubectl get pods -n monitoring
-
-# Access Grafana
-kubectl port-forward -n monitoring svc/grafana 3000:3000
-# Visit http://localhost:3000
-# Login: admin / (password from monitoring secrets)
-
-# Check dashboards are working
-```
-
-### Step 2: Configure Backups
+Your production monitoring stack provides complete observability with multiple tools:
+
+#### Production Monitoring URLs
+
+Access via domain (recommended):
+```
+https://monitoring.bakewise.ai/grafana        # Dashboards & visualization
+https://monitoring.bakewise.ai/prometheus     # Metrics & queries
+https://monitoring.bakewise.ai/signoz         # Unified observability platform (traces, metrics, logs)
+https://monitoring.bakewise.ai/alertmanager   # Alert management
+```
+
+Or via port forwarding (if needed):
+```bash
+# Grafana
+kubectl port-forward -n monitoring svc/grafana 3000:3000 &
+
+# Prometheus
+kubectl port-forward -n monitoring svc/prometheus-external 9090:9090 &
+
+# SigNoz
+kubectl port-forward -n monitoring svc/signoz-frontend 3301:3301 &
+
+# AlertManager
+kubectl port-forward -n monitoring svc/alertmanager-external 9093:9093 &
+```
+
+#### Available Dashboards
+
+Login to Grafana (admin / your-password) and explore:
+
+**Main Dashboards:**
+1. **Gateway Metrics** - HTTP request rates, latencies, error rates
+2. **Services Overview** - Multi-service health and performance
+3. **Circuit Breakers** - Reliability metrics
+
+**Extended Dashboards:**
+4. **Service Performance Monitoring (SPM)** - RED metrics from distributed traces
+5. **PostgreSQL Database** - Database health, connections, query performance
+6. **Node Exporter Infrastructure** - CPU, memory, disk, network per node
+7. **AlertManager Monitoring** - Alert tracking and notification status
+8. **Business Metrics & KPIs** - Tenant activity, ML jobs, forecasts
+
+#### Quick Health Check
+
+```bash
+# Verify all monitoring pods are running
+kubectl get pods -n monitoring
+
+# Check Prometheus targets (all should be UP)
+kubectl port-forward -n monitoring svc/prometheus-external 9090:9090
+# Open: http://localhost:9090/targets
+
+# View active alerts
+kubectl port-forward -n monitoring svc/prometheus-external 9090:9090
+# Open: http://localhost:9090/alerts
+```
+
+### Step 2: Configure Alerting
+
+Update AlertManager with your notification email addresses:
+
+```bash
+# Edit the alertmanager configuration
+kubectl edit configmap -n monitoring alertmanager-config
+
+# Update recipient emails in the routes section:
+# - alerts@bakewise.ai (general alerts)
+# - critical-alerts@bakewise.ai (critical issues)
+# - oncall@bakewise.ai (on-call rotation)
+```
+
+Test alert delivery:
+```bash
+# Fire a test alert
+kubectl run memory-test --image=polinux/stress --restart=Never \
+  --namespace=bakery-ia -- stress --vm 1 --vm-bytes 600M --timeout 300s
+
+# Check the alert appears in AlertManager
+# https://monitoring.bakewise.ai/alertmanager
+
+# Verify the email notification was received
+
+# Clean up the test
+kubectl delete pod memory-test -n bakery-ia
+```
+
+### Step 3: Configure Backups

 ```bash
 # Create backup script on VPS
@@ -902,26 +992,82 @@ kubectl edit configmap -n monitoring alertmanager-config
 # Update recipient emails in the routes section
 ```

-### Step 4: Document Everything
+### Step 4: Verify Monitoring is Working

-Create a runbook with:
-- [ ] VPS login credentials (stored securely)
+Before proceeding, ensure all monitoring components are operational:
+
+```bash
+# 1. Check Prometheus targets
+# Open: https://monitoring.bakewise.ai/prometheus/targets
+# All targets should show "UP" status
+
+# 2. Verify Grafana dashboards load data
+# Open: https://monitoring.bakewise.ai/grafana
+# Navigate to any dashboard and verify metrics are displaying
+
+# 3. Check SigNoz is receiving traces
+# Open: https://monitoring.bakewise.ai/signoz
+# Search for traces from the "gateway" service
+
+# 4. Verify the AlertManager cluster
+# Open: https://monitoring.bakewise.ai/alertmanager
+# Check that all 3 AlertManager instances are connected
+```
+
+### Step 5: Document Everything
+
+Create a secure runbook with all credentials and procedures:
+
+**Essential Information to Document:**
+- [ ] VPS login credentials (stored securely in password manager)
 - [ ] Database passwords (in password manager)
-- [ ] Domain registrar access
 - [ ] Grafana admin password
+- [ ] Domain registrar access (for bakewise.ai)
 - [ ] Cloudflare access
-- [ ] Email service credentials
+- [ ] Email service credentials (SMTP)
 - [ ] WhatsApp API credentials
 - [ ] Docker Hub / Registry credentials
 - [ ] Emergency contact information
 - [ ] Rollback procedures
+- [ ] Monitoring URLs and access procedures

-### Step 5: Train Your Team
+### Step 6: Train Your Team

-- [ ] Show team how to access Grafana dashboards
-- [ ] Demonstrate how to check logs: `kubectl logs`
-- [ ] Explain how to restart services if needed
-- [ ] Share this documentation with the team
-- [ ] Setup on-call rotation (if applicable)
+Conduct a training session covering:
+
+- [ ] **Access monitoring dashboards**
+  - Show how to login to https://monitoring.bakewise.ai/grafana
+  - Walk through key dashboards (Services Overview, Database, Infrastructure)
+  - Explain how to interpret metrics and identify issues
+
+- [ ] **Check application logs**
+  ```bash
+  # View logs for a service
+  kubectl logs -n bakery-ia deployment/orders-service --tail=100 -f
+
+  # Search for errors
+  kubectl logs -n bakery-ia deployment/gateway | grep ERROR
+  ```
+
+- [ ] **Restart services when needed**
+  ```bash
+  # Restart a service (rolling update, no downtime)
+  kubectl rollout restart deployment/orders-service -n bakery-ia
+  ```
+
+- [ ] **Respond to alerts**
+  - Show how to access AlertManager at https://monitoring.bakewise.ai/alertmanager
+  - Review common alerts and their resolution steps
+  - Reference the [Production Operations Guide](./PRODUCTION_OPERATIONS_GUIDE.md)
+
+- [ ] **Share documentation**
+  - [PILOT_LAUNCH_GUIDE.md](./PILOT_LAUNCH_GUIDE.md) - This guide
+  - [PRODUCTION_OPERATIONS_GUIDE.md](./PRODUCTION_OPERATIONS_GUIDE.md) - Daily operations
+  - [security-checklist.md](./security-checklist.md) - Security procedures
+
+- [ ] **Setup on-call rotation** (if applicable)
+  - Configure it in AlertManager
+  - Document escalation procedures

 ---
@@ -1050,16 +1196,25 @@ kubectl scale deployment monitoring -n bakery-ia --replicas=0

 ## Support Resources

-- **Full Monitoring Guide:** [MONITORING_DEPLOYMENT_SUMMARY.md](./MONITORING_DEPLOYMENT_SUMMARY.md)
-- **Operations Guide:** [PRODUCTION_OPERATIONS_GUIDE.md](./PRODUCTION_OPERATIONS_GUIDE.md)
-- **Security Guide:** [security-checklist.md](./security-checklist.md)
-- **Database Security:** [database-security.md](./database-security.md)
-- **TLS Configuration:** [tls-configuration.md](./tls-configuration.md)
+**Documentation:**
+- **Operations Guide:** [PRODUCTION_OPERATIONS_GUIDE.md](./PRODUCTION_OPERATIONS_GUIDE.md) - Daily operations, monitoring, incident response
+- **Security Guide:** [security-checklist.md](./security-checklist.md) - Security procedures and compliance
+- **Database Security:** [database-security.md](./database-security.md) - Database operations and TLS configuration
+- **TLS Configuration:** [tls-configuration.md](./tls-configuration.md) - Certificate management
+- **RBAC Implementation:** [rbac-implementation.md](./rbac-implementation.md) - Access control
+
+**Monitoring Access:**
+- **Grafana:** https://monitoring.bakewise.ai/grafana (admin / your-password)
+- **Prometheus:** https://monitoring.bakewise.ai/prometheus
+- **SigNoz:** https://monitoring.bakewise.ai/signoz
+- **AlertManager:** https://monitoring.bakewise.ai/alertmanager

 **External Resources:**
 - **MicroK8s Docs:** https://microk8s.io/docs
 - **Kubernetes Docs:** https://kubernetes.io/docs
 - **Let's Encrypt:** https://letsencrypt.org/docs
 - **Cloudflare DNS:** https://developers.cloudflare.com/dns
+- **Monitoring Stack README:** infrastructure/kubernetes/base/components/monitoring/README.md

 ---
@@ -32,7 +32,7 @@
 - **Services:** 18 microservices, 14 databases, monitoring stack
 - **Capacity:** 10-tenant pilot (scalable to 100+)
 - **Security:** TLS encryption, RBAC, audit logging
-- **Monitoring:** Prometheus, Grafana, AlertManager, Jaeger
+- **Monitoring:** Prometheus, Grafana, AlertManager, SigNoz

 **Key Metrics (10-tenant baseline):**
 - **Uptime Target:** 99.5% (3.65 hours downtime/month)
@@ -60,10 +60,10 @@

 **Production URLs:**
 ```
-https://monitoring.yourdomain.com/grafana        # Dashboards & visualization
-https://monitoring.yourdomain.com/prometheus     # Metrics & alerts
-https://monitoring.yourdomain.com/alertmanager   # Alert management
-https://monitoring.yourdomain.com/jaeger         # Distributed tracing
+https://monitoring.bakewise.ai/grafana           # Dashboards & visualization
+https://monitoring.bakewise.ai/prometheus        # Metrics & alerts
+https://monitoring.bakewise.ai/alertmanager      # Alert management
+https://monitoring.bakewise.ai/signoz            # Unified observability platform (traces, metrics, logs)
 ```

 **Port Forwarding (if ingress not available):**

@@ -77,8 +77,8 @@ kubectl port-forward -n monitoring svc/prometheus-external 9090:9090
 # AlertManager
 kubectl port-forward -n monitoring svc/alertmanager-external 9093:9093

-# Jaeger
-kubectl port-forward -n monitoring svc/jaeger-query 16686:16686
+# SigNoz
+kubectl port-forward -n monitoring svc/signoz-frontend 3301:3301
 ```

 ### Key Dashboards
@@ -1099,13 +1099,12 @@ kubectl exec -n bakery-ia deployment/auth-db -- \
 ## Support Resources

 **Documentation:**
-- [Pilot Launch Guide](./PILOT_LAUNCH_GUIDE.md) - Initial deployment
-- [Monitoring Summary](./MONITORING_DEPLOYMENT_SUMMARY.md) - Monitoring details
-- [Quick Start Monitoring](./QUICK_START_MONITORING.md) - Monitoring setup
-- [Security Checklist](./security-checklist.md) - Security procedures
-- [Database Security](./database-security.md) - Database operations
+- [Pilot Launch Guide](./PILOT_LAUNCH_GUIDE.md) - Initial deployment and setup
+- [Security Checklist](./security-checklist.md) - Security procedures and compliance
+- [Database Security](./database-security.md) - Database operations and best practices
 - [TLS Configuration](./tls-configuration.md) - Certificate management
-- [RBAC Implementation](./rbac-implementation.md) - Access control
+- [RBAC Implementation](./rbac-implementation.md) - Access control configuration
+- [Monitoring Stack README](../infrastructure/kubernetes/base/components/monitoring/README.md) - Detailed monitoring documentation

 **External Resources:**
 - Kubernetes: https://kubernetes.io/docs

@@ -1115,9 +1114,9 @@ kubectl exec -n bakery-ia deployment/auth-db -- \
 - PostgreSQL: https://www.postgresql.org/docs

 **Emergency Contacts:**
-- DevOps Team: devops@yourdomain.com
-- On-Call: oncall@yourdomain.com
-- Security Team: security@yourdomain.com
+- DevOps Team: devops@bakewise.ai
+- On-Call: oncall@bakewise.ai
+- Security Team: security@bakewise.ai

 ---
@@ -1,284 +0,0 @@
# 🚀 Quick Start: Deploy Monitoring to Production

**Time to deploy: ~15 minutes**

---

## Step 1: Update Secrets (5 min)

```bash
cd infrastructure/kubernetes/base/components/monitoring

# 1. Generate strong passwords
GRAFANA_PASS=$(openssl rand -base64 32)
echo "Grafana Password: $GRAFANA_PASS" > ~/SAVE_THIS_PASSWORD.txt

# 2. Edit secrets.yaml and replace:
# - CHANGE_ME_IN_PRODUCTION (Grafana password)
# - SMTP settings (your email server)
# - PostgreSQL connection string (your DB)

nano secrets.yaml
```

**Required Changes in secrets.yaml:**
```yaml
# Line 13: Change the Grafana password
admin-password: "YOUR_STRONG_PASSWORD_HERE"

# Lines 30-33: Update the SMTP settings
smtp-host: "smtp.gmail.com:587"
smtp-username: "your-alerts@yourdomain.com"
smtp-password: "YOUR_SMTP_PASSWORD"
smtp-from: "alerts@yourdomain.com"

# Line 49: Update the PostgreSQL connection
data-source-name: "postgresql://USER:PASSWORD@postgres.bakery-ia:5432/bakery?sslmode=require"
```

---

## Step 2: Update Alert Email Addresses (2 min)

```bash
# Edit alertmanager.yaml to set your team's email addresses
nano alertmanager.yaml

# Update these lines (search for @yourdomain.com):
# - Line 93: to: 'alerts@yourdomain.com'
# - Line 101: to: 'critical-alerts@yourdomain.com,oncall@yourdomain.com'
# - Line 116: to: 'alerts@yourdomain.com'
# - Line 125: to: 'alert-system-team@yourdomain.com'
# - Line 134: to: 'database-team@yourdomain.com'
# - Line 143: to: 'infra-team@yourdomain.com'
```

---
## Step 3: Deploy to Production (3 min)

```bash
# Return to the project root
cd /Users/urtzialfaro/Documents/bakery-ia

# Deploy the entire stack
kubectl apply -k infrastructure/kubernetes/overlays/prod

# Watch the pods come up
kubectl get pods -n monitoring -w
```

**Expected Output:**
```
NAME                      READY   STATUS    RESTARTS   AGE
prometheus-0              1/1     Running   0          2m
prometheus-1              1/1     Running   0          1m
alertmanager-0            2/2     Running   0          2m
alertmanager-1            2/2     Running   0          1m
alertmanager-2            2/2     Running   0          1m
grafana-xxxxx             1/1     Running   0          2m
postgres-exporter-xxxxx   1/1     Running   0          2m
node-exporter-xxxxx       1/1     Running   0          2m
jaeger-xxxxx              1/1     Running   0          2m
```

---

## Step 4: Verify Deployment (3 min)

```bash
# Check that all pods are running
kubectl get pods -n monitoring

# Check that storage is provisioned
kubectl get pvc -n monitoring

# Check that services are created
kubectl get svc -n monitoring
```

---

## Step 5: Access Dashboards (2 min)

### **Option A: Via Ingress (if configured)**
```
https://monitoring.yourdomain.com/grafana
https://monitoring.yourdomain.com/prometheus
https://monitoring.yourdomain.com/alertmanager
https://monitoring.yourdomain.com/jaeger
```

### **Option B: Via Port Forwarding**
```bash
# Grafana
kubectl port-forward -n monitoring svc/grafana 3000:3000 &

# Prometheus
kubectl port-forward -n monitoring svc/prometheus-external 9090:9090 &

# AlertManager
kubectl port-forward -n monitoring svc/alertmanager-external 9093:9093 &

# Jaeger
kubectl port-forward -n monitoring svc/jaeger-query 16686:16686 &

# Now access:
# - Grafana: http://localhost:3000 (admin / YOUR_PASSWORD)
# - Prometheus: http://localhost:9090
# - AlertManager: http://localhost:9093
# - Jaeger: http://localhost:16686
```

---
## Step 6: Verify Everything Works (5 min)

### **Check Prometheus Targets**
1. Open Prometheus: http://localhost:9090
2. Go to Status → Targets
3. Verify all targets are **UP**:
   - prometheus (1/1 up)
   - bakery-services (multiple pods up)
   - alertmanager (3/3 up)
   - postgres-exporter (1/1 up)
   - node-exporter (N/N up, where N = number of nodes)

### **Check Grafana Dashboards**
1. Open Grafana: http://localhost:3000
2. Log in with admin / YOUR_PASSWORD
3. Go to Dashboards → Browse
4. You should see 11 dashboards:
   - Bakery IA folder: Gateway Metrics, Services Overview, Circuit Breakers
   - Bakery IA - Extended folder: PostgreSQL, Node Exporter, AlertManager, Business Metrics
5. Open any dashboard and verify that data is loading

### **Test Alert Flow**
```bash
# Fire a test alert by creating a high-memory pod
kubectl run memory-test --image=polinux/stress --restart=Never \
  --namespace=bakery-ia -- stress --vm 1 --vm-bytes 600M --timeout 300s

# Wait 5 minutes, then check:
# 1. Prometheus alerts: http://localhost:9090/alerts
#    - "HighMemoryUsage" should be firing
# 2. AlertManager: http://localhost:9093
#    - The alert should be visible
# 3. Email inbox - you should receive a notification

# Clean up
kubectl delete pod memory-test -n bakery-ia
```

### **Verify Jaeger Tracing**
1. Make a request to your API:
   ```bash
   curl -H "Authorization: Bearer YOUR_TOKEN" \
     https://api.yourdomain.com/api/v1/health
   ```
2. Open Jaeger: http://localhost:16686
3. Select a service from the dropdown
4. Click "Find Traces"
5. You should see traces appearing

---

## ✅ Success Criteria

Your monitoring is working correctly if:

- [x] All Prometheus targets show "UP" status
- [x] Grafana dashboards display metrics
- [x] The AlertManager cluster shows 3/3 members
- [x] A test alert fired and the email was received
- [x] Jaeger shows traces from services
- [x] No pods are in CrashLoopBackOff state
- [x] All PVCs are Bound

---
## 🔧 Troubleshooting

### **Problem: Pods not starting**
```bash
# Check pod status
kubectl describe pod POD_NAME -n monitoring

# Check logs
kubectl logs POD_NAME -n monitoring

# Common issues:
# - Insufficient resources: check node capacity
# - PVC not binding: check that the storage class exists
# - Image pull errors: check network/registry access
```

### **Problem: Prometheus targets DOWN**
```bash
# Check if the services exist
kubectl get svc -n bakery-ia

# Check if the pods have the correct labels
kubectl get pods -n bakery-ia --show-labels

# Check if the pods expose the metrics port (8080)
kubectl get pod POD_NAME -n bakery-ia -o yaml | grep -A 5 ports
```
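
A common cause of DOWN targets is missing scrape metadata on the pods; below is a hedged sketch assuming the scrape config keys on the common `prometheus.io/*` annotation convention (check prometheus.yaml for the actual relabeling rules):

```yaml
# Hypothetical pod-template metadata that a kubernetes_sd + relabeling setup
# often uses to discover scrape targets.
metadata:
  labels:
    app: orders-service            # assumed label
  annotations:
    prometheus.io/scrape: "true"
    prometheus.io/port: "8080"     # must match the container's metrics port
    prometheus.io/path: "/metrics"
```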

### **Problem: Grafana shows "No Data"**
```bash
# Test the Prometheus datasource
kubectl port-forward -n monitoring svc/prometheus-external 9090:9090

# Run a test query against Prometheus
curl "http://localhost:9090/api/v1/query?query=up" | jq

# If Prometheus has data but Grafana doesn't, check the Grafana datasource config
```
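
For that last check, a minimal Grafana datasource provisioning sketch; the in-cluster URL is an assumption based on this stack's `prometheus-external` service name:

```yaml
# Hypothetical datasource provisioning file (the real one is mounted by
# grafana.yaml): Grafana reaches Prometheus via the service's cluster DNS name.
apiVersion: 1
datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus-external.monitoring.svc.cluster.local:9090
    isDefault: true
```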

### **Problem: Alerts not firing**
```bash
# Check that the alert rules are loaded
kubectl logs -n monitoring prometheus-0 | grep "Loading configuration"

# Check the AlertManager config
kubectl exec -n monitoring alertmanager-0 -- cat /etc/alertmanager/alertmanager.yml

# Test the SMTP connection
kubectl exec -n monitoring alertmanager-0 -- \
  nc -zv smtp.gmail.com 587
```

---

## 📞 Need Help?

1. Check the full documentation: [infrastructure/kubernetes/base/components/monitoring/README.md](infrastructure/kubernetes/base/components/monitoring/README.md)
2. Review the deployment summary: [MONITORING_DEPLOYMENT_SUMMARY.md](MONITORING_DEPLOYMENT_SUMMARY.md)
3. Check Prometheus logs: `kubectl logs -n monitoring prometheus-0`
4. Check AlertManager logs: `kubectl logs -n monitoring alertmanager-0`
5. Check Grafana logs: `kubectl logs -n monitoring deployment/grafana`

---

## 🎉 You're Done!

Your monitoring stack is now running in production!

**Next steps:**
1. Save your Grafana password securely
2. Set up the on-call rotation
3. Review alert thresholds and adjust as needed
4. Create team-specific dashboards
5. Train the team on using the monitoring tools

**Access your monitoring:**
- Grafana: https://monitoring.yourdomain.com/grafana
- Prometheus: https://monitoring.yourdomain.com/prometheus
- AlertManager: https://monitoring.yourdomain.com/alertmanager
- Jaeger: https://monitoring.yourdomain.com/jaeger

---

*Deployment time: ~15 minutes*
*Last updated: 2026-01-07*
@@ -10,7 +10,7 @@ import resource
 import os
 from fastapi import FastAPI, Request, HTTPException, Depends, WebSocket, WebSocketDisconnect
 from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import JSONResponse, StreamingResponse
+from fastapi.responses import JSONResponse, StreamingResponse, Response
 import httpx
 import time
 from shared.redis_utils import initialize_redis, close_redis, get_redis_client
@@ -27,7 +27,42 @@ from app.middleware.demo_middleware import DemoMiddleware
 from app.middleware.read_only_mode import ReadOnlyModeMiddleware
 from app.routes import auth, tenant, notification, nominatim, subscription, demo, pos, geocoding, poi_context
 from shared.monitoring.logging import setup_logging
-from shared.monitoring.metrics import MetricsCollector
+from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware
+
+# OpenTelemetry imports
+from opentelemetry import trace
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import BatchSpanProcessor
+from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
+from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
+from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
+from opentelemetry.instrumentation.redis import RedisInstrumentor
+from opentelemetry.sdk.resources import Resource
+
+# Configure OpenTelemetry tracing
+def setup_tracing(service_name: str = "gateway"):
+    """Initialize OpenTelemetry tracing with OTLP exporter for Jaeger"""
+    # Create resource with service name
+    resource = Resource.create({"service.name": service_name})
+
+    # Configure OTLP exporter (sends to OpenTelemetry Collector)
+    otlp_exporter = OTLPSpanExporter(
+        endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://otel-collector.monitoring.svc.cluster.local:4317"),
+        insecure=True  # Use insecure connection for internal cluster communication
+    )
+
+    # Configure tracer provider
+    provider = TracerProvider(resource=resource)
+    processor = BatchSpanProcessor(otlp_exporter)
+    provider.add_span_processor(processor)
+
+    # Set global tracer provider
+    trace.set_tracer_provider(provider)
+
+    return provider
+
+# Initialize tracing
+tracer_provider = setup_tracing("gateway")

 # Setup logging
 setup_logging("gateway", settings.LOG_LEVEL)
@@ -75,9 +110,21 @@ app = FastAPI(
|
||||
redirect_slashes=False # Disable automatic trailing slash redirects
|
||||
)
|
||||
|
||||
# Instrument FastAPI with OpenTelemetry
|
||||
FastAPIInstrumentor.instrument_app(app)
|
||||
|
||||
# Instrument httpx for outgoing requests
|
||||
HTTPXClientInstrumentor().instrument()
|
||||
|
||||
# Instrument Redis (will be active once redis client is initialized)
|
||||
RedisInstrumentor().instrument()
|
||||
|
||||
# Initialize metrics collector
|
||||
metrics_collector = MetricsCollector("gateway")
|
||||
|
||||
# Add metrics middleware to track HTTP requests
|
||||
add_metrics_middleware(app, metrics_collector)
|
||||
|
||||
# Redis client for SSE streaming
|
||||
redis_client = None
|
||||
|
||||
@@ -182,8 +229,11 @@ async def health_check():
|
||||
|
||||
@app.get("/metrics")
|
||||
async def metrics():
|
||||
"""Metrics endpoint for monitoring"""
|
||||
return {"metrics": "enabled"}
|
||||
"""Prometheus metrics endpoint"""
|
||||
return Response(
|
||||
content=metrics_collector.get_metrics(),
|
||||
media_type="text/plain; version=0.0.4; charset=utf-8"
|
||||
)
|
||||
|
||||
# ================================================================
|
||||
# SERVER-SENT EVENTS (SSE) HELPER FUNCTIONS
|
||||

@@ -19,3 +19,9 @@ sqlalchemy==2.0.44
asyncpg==0.30.0
cryptography==44.0.0
ortools==9.8.3296
opentelemetry-api==1.27.0
opentelemetry-sdk==1.27.0
opentelemetry-instrumentation-fastapi==0.48b0
opentelemetry-exporter-otlp-proto-grpc==1.27.0
opentelemetry-instrumentation-httpx==0.48b0
opentelemetry-instrumentation-redis==0.48b0
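
A minimal sanity check that the new pins install and import cleanly, assuming you run it inside the service's virtualenv:

```bash
# Verify the OpenTelemetry pins resolve and import (run inside the service venv)
pip install -r requirements.txt
python -c "import opentelemetry.trace, opentelemetry.sdk.trace; print('opentelemetry OK')"
```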

@@ -1,201 +0,0 @@
# Infrastructure Cleanup Summary

**Date:** 2026-01-07
**Action:** Removed legacy Docker Compose infrastructure files

---

## Deleted Directories and Files

The following legacy infrastructure files have been removed as they were specific to Docker Compose deployment and are **not used** in the Kubernetes deployment:

### ❌ Removed:
- `infrastructure/pgadmin/` - pgAdmin configuration for Docker Compose
  - `pgpass` - Password file
  - `servers.json` - Server definitions

- `infrastructure/postgres/` - PostgreSQL configuration for Docker Compose
  - `init-scripts/init.sql` - Database initialization

- `infrastructure/rabbitmq/` - RabbitMQ configuration for Docker Compose
  - `definitions.json` - Queue/exchange definitions
  - `rabbitmq.conf` - RabbitMQ settings

- `infrastructure/redis/` - Redis configuration for Docker Compose
  - `redis.conf` - Redis settings

- `infrastructure/terraform/` - Terraform infrastructure-as-code (unused)
  - `base/`, `dev/`, `staging/`, `production/` directories
  - `modules/` directory

- `infrastructure/rabbitmq.conf` - Standalone RabbitMQ config file

### ✅ Retained:

#### `infrastructure/kubernetes/`
**Purpose:** Complete Kubernetes deployment manifests
**Status:** Active and required
**Contents:**
- `base/` - Base Kubernetes resources
  - `components/` - All service deployments
  - `databases/` - Database deployments (uses embedded configs)
  - `monitoring/` - Prometheus, Grafana, AlertManager
  - `migrations/` - Database migration jobs
  - `secrets/` - TLS secrets and application secrets
  - `configmaps/` - PostgreSQL logging config
- `overlays/` - Environment-specific configurations
  - `dev/` - Development overlay
  - `prod/` - Production overlay
- `encryption/` - Kubernetes secrets encryption config

#### `infrastructure/tls/`
**Purpose:** TLS/SSL certificates for database encryption
**Status:** Active and required
**Contents:**
- `ca/` - Certificate Authority (10-year validity)
  - `ca-cert.pem` - CA certificate
  - `ca-key.pem` - CA private key (KEEP SECURE!)
- `postgres/` - PostgreSQL server certificates (3-year validity)
  - `server-cert.pem`, `server-key.pem`, `ca-cert.pem`
- `redis/` - Redis server certificates (3-year validity)
  - `redis-cert.pem`, `redis-key.pem`, `ca-cert.pem`
- `generate-certificates.sh` - Certificate generation script

---

## Why These Were Removed

### Docker Compose vs Kubernetes

The removed files were configuration files for **Docker Compose** deployments:
- pgAdmin was used for local database management (not needed in prod)
- Standalone config files (rabbitmq.conf, redis.conf, postgres init scripts) were mounted as volumes in Docker Compose
- Terraform was an unused infrastructure-as-code attempt

### Kubernetes Uses a Different Approach

Kubernetes deployment uses:
- **ConfigMaps** instead of config files
- **Secrets** instead of environment files
- **Kubernetes manifests** instead of docker-compose.yml
- **Built-in orchestration** instead of Terraform

**Example:**
```yaml
# OLD (Docker Compose):
volumes:
  - ./infrastructure/rabbitmq/rabbitmq.conf:/etc/rabbitmq/rabbitmq.conf

# NEW (Kubernetes):
env:
  - name: RABBITMQ_DEFAULT_USER
    valueFrom:
      secretKeyRef:
        name: rabbitmq-secrets
        key: RABBITMQ_USER
```

---

## Verification

### No References Found
Searched the entire codebase and confirmed **zero references** to the removed folders:
```bash
grep -r "infrastructure/pgadmin" --include="*.yaml" --include="*.sh"
# No results

grep -r "infrastructure/terraform" --include="*.yaml" --include="*.sh"
# No results
```

### Kubernetes Deployment Unaffected
- All services use Kubernetes ConfigMaps and Secrets
- Database configs embedded in deployment YAML files
- TLS certificates managed via Kubernetes Secrets (from `infrastructure/tls/`)

---

## Current Infrastructure Structure

```
infrastructure/
├── kubernetes/              # ✅ ACTIVE - All K8s manifests
│   ├── base/                # Base resources
│   │   ├── components/      # Service deployments
│   │   ├── secrets/         # TLS secrets
│   │   ├── configmaps/      # Configuration
│   │   └── kustomization.yaml  # Base kustomization
│   ├── overlays/            # Environment overlays
│   │   ├── dev/             # Development
│   │   └── prod/            # Production
│   └── encryption/          # K8s secrets encryption
└── tls/                     # ✅ ACTIVE - TLS certificates
    ├── ca/                  # Certificate Authority
    ├── postgres/            # PostgreSQL certs
    ├── redis/               # Redis certs
    └── generate-certificates.sh

REMOVED (Docker Compose legacy):
├── pgadmin/                 # ❌ DELETED
├── postgres/                # ❌ DELETED
├── rabbitmq/                # ❌ DELETED
├── redis/                   # ❌ DELETED
├── terraform/               # ❌ DELETED
└── rabbitmq.conf            # ❌ DELETED
```

---

## Impact Assessment

### ✅ No Breaking Changes
- Kubernetes deployment unchanged
- All services continue to work
- TLS certificates still available
- Production readiness maintained

### ✅ Benefits
- Cleaner repository structure
- Less confusion about which configs are used
- Faster repository cloning (smaller size)
- Clear separation: Kubernetes-only deployment

### ✅ Documentation Updated
- [PILOT_LAUNCH_GUIDE.md](../docs/PILOT_LAUNCH_GUIDE.md) - Uses only Kubernetes
- [PRODUCTION_OPERATIONS_GUIDE.md](../docs/PRODUCTION_OPERATIONS_GUIDE.md) - References only K8s resources
- [infrastructure/kubernetes/README.md](kubernetes/README.md) - K8s-specific documentation

---

## Rollback (If Needed)

If for any reason you need these files back, they can be restored from git:

```bash
# View deleted files
git log --diff-filter=D --summary | grep infrastructure

# Restore specific folder (example)
git checkout HEAD~1 -- infrastructure/pgadmin/

# Or restore all deleted infrastructure
git checkout HEAD~1 -- infrastructure/
```

**Note:** You won't need these for Kubernetes deployment. They were Docker Compose specific.

---

## Related Documentation

- [Kubernetes README](kubernetes/README.md) - K8s deployment guide
- [TLS Configuration](../docs/tls-configuration.md) - Certificate management
- [Database Security](../docs/database-security.md) - Database encryption
- [Pilot Launch Guide](../docs/PILOT_LAUNCH_GUIDE.md) - Production deployment

---

**Cleanup Performed By:** Claude Code
**Verified By:** Infrastructure analysis and grep searches
**Status:** ✅ Complete - No issues found

infrastructure/helm/signoz-values-dev.yaml (new file, 316 lines)
@@ -0,0 +1,316 @@
# SigNoz Helm Chart Values - Development Environment
# Optimized for local development with minimal resource usage
#
# Official Chart: https://github.com/SigNoz/charts
# Install Command: helm install signoz signoz/signoz -n signoz --create-namespace -f signoz-values-dev.yaml

global:
  storageClass: "standard"
  domain: "localhost"

# Frontend Configuration
frontend:
  replicaCount: 1
  image:
    repository: signoz/frontend
    tag: 0.52.3
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    port: 3301

  ingress:
    enabled: true
    className: nginx
    annotations:
      nginx.ingress.kubernetes.io/rewrite-target: /$2
      nginx.ingress.kubernetes.io/use-regex: "true"
    hosts:
      - host: localhost
        paths:
          - path: /signoz(/|$)(.*)
            pathType: ImplementationSpecific
    tls: []

  resources:
    requests:
      cpu: 50m
      memory: 128Mi
    limits:
      cpu: 200m
      memory: 256Mi

  env:
    - name: FRONTEND_REFRESH_INTERVAL
      value: "30000"

# Query Service Configuration
queryService:
  replicaCount: 1
  image:
    repository: signoz/query-service
    tag: 0.52.3
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    port: 8080

  resources:
    requests:
      cpu: 100m
      memory: 256Mi
    limits:
      cpu: 500m
      memory: 512Mi

  env:
    - name: DEPLOYMENT_TYPE
      value: "kubernetes-helm"
    - name: SIGNOZ_LOCAL_DB_PATH
      value: "/var/lib/signoz"

  persistence:
    enabled: true
    size: 5Gi
    storageClass: "standard"

# AlertManager Configuration
alertmanager:
  replicaCount: 1
  image:
    repository: signoz/alertmanager
    tag: 0.23.5
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    port: 9093

  resources:
    requests:
      cpu: 50m
      memory: 128Mi
    limits:
      cpu: 200m
      memory: 256Mi

  persistence:
    enabled: true
    size: 2Gi
    storageClass: "standard"

  config:
    global:
      resolve_timeout: 5m
    route:
      group_by: ['alertname', 'cluster', 'service']
      group_wait: 10s
      group_interval: 10s
      repeat_interval: 12h
      receiver: 'default'
    receivers:
      - name: 'default'
        # Add email, slack, webhook configs here

# ClickHouse Configuration - Time Series Database
clickhouse:
  replicaCount: 1
  image:
    repository: clickhouse/clickhouse-server
    tag: 24.1.2-alpine
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    httpPort: 8123
    tcpPort: 9000

  resources:
    requests:
      cpu: 500m
      memory: 512Mi
    limits:
      cpu: 1000m
      memory: 1Gi

  persistence:
    enabled: true
    size: 10Gi
    storageClass: "standard"

  # ClickHouse configuration
  config:
    logger:
      level: information
    max_connections: 1024
    max_concurrent_queries: 100
    # Data retention (7 days for dev)
    merge_tree:
      parts_to_delay_insert: 150
      parts_to_throw_insert: 300

# OpenTelemetry Collector - Integrated with SigNoz
otelCollector:
  enabled: true
  replicaCount: 1
  image:
    repository: signoz/signoz-otel-collector
    tag: 0.102.8
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    ports:
      otlpGrpc: 4317
      otlpHttp: 4318
      metrics: 8888
      healthCheck: 13133

  resources:
    requests:
      cpu: 100m
      memory: 256Mi
    limits:
      cpu: 500m
      memory: 512Mi

  # Full OTEL Collector Configuration
  config:
    extensions:
      health_check:
        endpoint: 0.0.0.0:13133
      zpages:
        endpoint: 0.0.0.0:55679

    receivers:
      otlp:
        protocols:
          grpc:
            endpoint: 0.0.0.0:4317
          http:
            endpoint: 0.0.0.0:4318
            cors:
              allowed_origins:
                - "http://localhost"
                - "https://localhost"

      # Prometheus receiver for scraping metrics
      prometheus:
        config:
          scrape_configs:
            - job_name: 'otel-collector'
              scrape_interval: 30s
              static_configs:
                - targets: ['localhost:8888']

    processors:
      batch:
        timeout: 10s
        send_batch_size: 1024

      memory_limiter:
        check_interval: 1s
        limit_mib: 400
        spike_limit_mib: 100

      # Resource detection for K8s
      resourcedetection:
        detectors: [env, system, docker]
        timeout: 5s

      # Add resource attributes
      resource:
        attributes:
          - key: deployment.environment
            value: development
            action: upsert

    exporters:
      # Export to SigNoz ClickHouse
      clickhousetraces:
        datasource: tcp://clickhouse:9000/?database=signoz_traces
        timeout: 10s

      clickhousemetricswrite:
        endpoint: tcp://clickhouse:9000/?database=signoz_metrics
        timeout: 10s

      clickhouselogsexporter:
        dsn: tcp://clickhouse:9000/?database=signoz_logs
        timeout: 10s

      # Debug logging
      logging:
        loglevel: info
        sampling_initial: 5
        sampling_thereafter: 200

    service:
      extensions: [health_check, zpages]
      pipelines:
        traces:
          receivers: [otlp]
          processors: [memory_limiter, batch, resourcedetection, resource]
          exporters: [clickhousetraces, logging]

        metrics:
          receivers: [otlp, prometheus]
          processors: [memory_limiter, batch, resourcedetection, resource]
          exporters: [clickhousemetricswrite]

        logs:
          receivers: [otlp]
          processors: [memory_limiter, batch, resourcedetection, resource]
          exporters: [clickhouselogsexporter, logging]

# OpenTelemetry Collector Deployment Mode
otelCollectorDeployment:
  enabled: true
  mode: deployment

# Node Exporter for infrastructure metrics (optional)
nodeExporter:
  enabled: true
  service:
    type: ClusterIP
    port: 9100

  resources:
    requests:
      cpu: 50m
      memory: 64Mi
    limits:
      cpu: 100m
      memory: 128Mi

# Schemamanager - Manages ClickHouse schema
schemamanager:
  enabled: true
  image:
    repository: signoz/signoz-schema-migrator
    tag: 0.52.3
    pullPolicy: IfNotPresent

# Additional Configuration
serviceAccount:
  create: true
  annotations: {}
  name: ""

# Security Context
securityContext:
  runAsNonRoot: true
  runAsUser: 1000
  fsGroup: 1000

# Network Policies (disabled for dev)
networkPolicy:
  enabled: false

# Monitoring SigNoz itself
selfMonitoring:
  enabled: true
  serviceMonitor:
    enabled: false
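
As a usage note, here is a sketch of deploying this dev profile; the release name and install command follow the file's own header, while the `helm repo add` step is an assumption about a fresh workstation:

```bash
# Deploy SigNoz locally with the dev values (sketch; assumes the official chart repo)
helm repo add signoz https://charts.signoz.io
helm repo update
helm install signoz signoz/signoz -n signoz --create-namespace \
  -f infrastructure/helm/signoz-values-dev.yaml
kubectl get pods -n signoz   # components should appear here once deployed
```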

infrastructure/helm/signoz-values-prod.yaml (new file, 471 lines)
@@ -0,0 +1,471 @@
# SigNoz Helm Chart Values - Production Environment
# High-availability configuration with resource optimization
#
# Official Chart: https://github.com/SigNoz/charts
# Install Command: helm install signoz signoz/signoz -n signoz --create-namespace -f signoz-values-prod.yaml

global:
  storageClass: "standard"
  domain: "monitoring.bakewise.ai"

# Frontend Configuration
frontend:
  replicaCount: 2
  image:
    repository: signoz/frontend
    tag: 0.52.3
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    port: 3301

  ingress:
    enabled: true
    className: nginx
    annotations:
      nginx.ingress.kubernetes.io/rewrite-target: /$2
      nginx.ingress.kubernetes.io/use-regex: "true"
      cert-manager.io/cluster-issuer: "letsencrypt-prod"
      nginx.ingress.kubernetes.io/ssl-redirect: "true"
    hosts:
      - host: monitoring.bakewise.ai
        paths:
          - path: /signoz(/|$)(.*)
            pathType: ImplementationSpecific
    tls:
      - secretName: signoz-tls
        hosts:
          - monitoring.bakewise.ai

  resources:
    requests:
      cpu: 250m
      memory: 512Mi
    limits:
      cpu: 500m
      memory: 1Gi

  # Pod Anti-affinity for HA
  affinity:
    podAntiAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 100
          podAffinityTerm:
            labelSelector:
              matchExpressions:
                - key: app
                  operator: In
                  values:
                    - signoz-frontend
            topologyKey: kubernetes.io/hostname

  env:
    - name: FRONTEND_REFRESH_INTERVAL
      value: "30000"

# Query Service Configuration
queryService:
  replicaCount: 2
  image:
    repository: signoz/query-service
    tag: 0.52.3
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    port: 8080

  resources:
    requests:
      cpu: 500m
      memory: 1Gi
    limits:
      cpu: 1000m
      memory: 2Gi

  # Pod Anti-affinity for HA
  affinity:
    podAntiAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 100
          podAffinityTerm:
            labelSelector:
              matchExpressions:
                - key: app
                  operator: In
                  values:
                    - signoz-query-service
            topologyKey: kubernetes.io/hostname

  env:
    - name: DEPLOYMENT_TYPE
      value: "kubernetes-helm"
    - name: SIGNOZ_LOCAL_DB_PATH
      value: "/var/lib/signoz"
    - name: RETENTION_DAYS
      value: "30"

  persistence:
    enabled: true
    size: 20Gi
    storageClass: "standard"

  # Horizontal Pod Autoscaler
  autoscaling:
    enabled: true
    minReplicas: 2
    maxReplicas: 5
    targetCPUUtilizationPercentage: 70
    targetMemoryUtilizationPercentage: 80

# AlertManager Configuration
alertmanager:
  replicaCount: 2
  image:
    repository: signoz/alertmanager
    tag: 0.23.5
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    port: 9093

  resources:
    requests:
      cpu: 250m
      memory: 512Mi
    limits:
      cpu: 500m
      memory: 1Gi

  # Pod Anti-affinity for HA
  affinity:
    podAntiAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 100
          podAffinityTerm:
            labelSelector:
              matchExpressions:
                - key: app
                  operator: In
                  values:
                    - signoz-alertmanager
            topologyKey: kubernetes.io/hostname

  persistence:
    enabled: true
    size: 5Gi
    storageClass: "standard"

  config:
    global:
      resolve_timeout: 5m
      smtp_smarthost: 'smtp.gmail.com:587'
      smtp_from: 'alerts@bakewise.ai'
      smtp_auth_username: 'alerts@bakewise.ai'
      smtp_auth_password: '${SMTP_PASSWORD}'
      smtp_require_tls: true

    route:
      group_by: ['alertname', 'cluster', 'service', 'severity']
      group_wait: 10s
      group_interval: 10s
      repeat_interval: 12h
      receiver: 'critical-alerts'
      routes:
        - match:
            severity: critical
          receiver: 'critical-alerts'
          continue: true
        - match:
            severity: warning
          receiver: 'warning-alerts'

    receivers:
      - name: 'critical-alerts'
        email_configs:
          - to: 'critical-alerts@bakewise.ai'
            headers:
              Subject: '[CRITICAL] {{ .GroupLabels.alertname }} - Bakery IA'
        # Slack webhook for critical alerts
        slack_configs:
          - api_url: '${SLACK_WEBHOOK_URL}'
            channel: '#alerts-critical'
            title: '[CRITICAL] {{ .GroupLabels.alertname }}'
            text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'

      - name: 'warning-alerts'
        email_configs:
          - to: 'oncall@bakewise.ai'
            headers:
              Subject: '[WARNING] {{ .GroupLabels.alertname }} - Bakery IA'

# ClickHouse Configuration - Time Series Database
clickhouse:
  replicaCount: 2
  image:
    repository: clickhouse/clickhouse-server
    tag: 24.1.2-alpine
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    httpPort: 8123
    tcpPort: 9000

  resources:
    requests:
      cpu: 1000m
      memory: 2Gi
    limits:
      cpu: 2000m
      memory: 4Gi

  # Pod Anti-affinity for HA
  affinity:
    podAntiAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        - labelSelector:
            matchExpressions:
              - key: app
                operator: In
                values:
                  - signoz-clickhouse
          topologyKey: kubernetes.io/hostname

  persistence:
    enabled: true
    size: 100Gi
    storageClass: "standard"

  # ClickHouse configuration
  config:
    logger:
      level: information
    max_connections: 4096
    max_concurrent_queries: 500
    # Data retention (30 days for prod)
    merge_tree:
      parts_to_delay_insert: 150
      parts_to_throw_insert: 300
    # Performance tuning
    max_memory_usage: 10000000000
    max_bytes_before_external_group_by: 20000000000

  # Backup configuration
  backup:
    enabled: true
    schedule: "0 2 * * *"
    retention: 7

# OpenTelemetry Collector - Integrated with SigNoz
otelCollector:
  enabled: true
  replicaCount: 2
  image:
    repository: signoz/signoz-otel-collector
    tag: 0.102.8
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    ports:
      otlpGrpc: 4317
      otlpHttp: 4318
      metrics: 8888
      healthCheck: 13133

  resources:
    requests:
      cpu: 500m
      memory: 512Mi
    limits:
      cpu: 1000m
      memory: 1Gi

  # Full OTEL Collector Configuration
  config:
    extensions:
      health_check:
        endpoint: 0.0.0.0:13133
      zpages:
        endpoint: 0.0.0.0:55679

    receivers:
      otlp:
        protocols:
          grpc:
            endpoint: 0.0.0.0:4317
            max_recv_msg_size_mib: 16
          http:
            endpoint: 0.0.0.0:4318
            cors:
              allowed_origins:
                - "https://monitoring.bakewise.ai"
                - "https://*.bakewise.ai"

      # Prometheus receiver for scraping metrics
      prometheus:
        config:
          scrape_configs:
            - job_name: 'otel-collector'
              scrape_interval: 30s
              static_configs:
                - targets: ['localhost:8888']

    processors:
      batch:
        timeout: 10s
        send_batch_size: 2048
        send_batch_max_size: 4096

      memory_limiter:
        check_interval: 1s
        limit_mib: 800
        spike_limit_mib: 200

      # Resource detection for K8s
      resourcedetection:
        detectors: [env, system, docker]
        timeout: 5s

      # Add resource attributes
      resource:
        attributes:
          - key: deployment.environment
            value: production
            action: upsert
          - key: cluster.name
            value: bakery-ia-prod
            action: upsert

    exporters:
      # Export to SigNoz ClickHouse
      clickhousetraces:
        datasource: tcp://clickhouse:9000/?database=signoz_traces
        timeout: 10s
        retry_on_failure:
          enabled: true
          initial_interval: 5s
          max_interval: 30s
          max_elapsed_time: 300s

      clickhousemetricswrite:
        endpoint: tcp://clickhouse:9000/?database=signoz_metrics
        timeout: 10s
        retry_on_failure:
          enabled: true
          initial_interval: 5s
          max_interval: 30s
          max_elapsed_time: 300s

      clickhouselogsexporter:
        dsn: tcp://clickhouse:9000/?database=signoz_logs
        timeout: 10s
        retry_on_failure:
          enabled: true
          initial_interval: 5s
          max_interval: 30s
          max_elapsed_time: 300s

      # Minimal logging for prod
      logging:
        loglevel: warn
        sampling_initial: 2
        sampling_thereafter: 500

    service:
      extensions: [health_check, zpages]
      pipelines:
        traces:
          receivers: [otlp]
          processors: [memory_limiter, batch, resourcedetection, resource]
          exporters: [clickhousetraces, logging]

        metrics:
          receivers: [otlp, prometheus]
          processors: [memory_limiter, batch, resourcedetection, resource]
          exporters: [clickhousemetricswrite]

        logs:
          receivers: [otlp]
          processors: [memory_limiter, batch, resourcedetection, resource]
          exporters: [clickhouselogsexporter, logging]

# OpenTelemetry Collector Deployment Mode
otelCollectorDeployment:
  enabled: true
  mode: deployment

  # HPA for OTEL Collector
  autoscaling:
    enabled: true
    minReplicas: 2
    maxReplicas: 10
    targetCPUUtilizationPercentage: 70
    targetMemoryUtilizationPercentage: 80

# Node Exporter for infrastructure metrics
nodeExporter:
  enabled: true
  service:
    type: ClusterIP
    port: 9100

  resources:
    requests:
      cpu: 100m
      memory: 128Mi
    limits:
      cpu: 200m
      memory: 256Mi

# Schemamanager - Manages ClickHouse schema
schemamanager:
  enabled: true
  image:
    repository: signoz/signoz-schema-migrator
    tag: 0.52.3
    pullPolicy: IfNotPresent

# Additional Configuration
serviceAccount:
  create: true
  annotations: {}
  name: "signoz"

# Security Context
securityContext:
  runAsNonRoot: true
  runAsUser: 1000
  fsGroup: 1000

# Pod Disruption Budgets for HA
podDisruptionBudget:
  frontend:
    enabled: true
    minAvailable: 1
  queryService:
    enabled: true
    minAvailable: 1
  alertmanager:
    enabled: true
    minAvailable: 1
  clickhouse:
    enabled: true
    minAvailable: 1

# Network Policies for security
networkPolicy:
  enabled: true
  policyTypes:
    - Ingress
    - Egress

# Monitoring SigNoz itself
selfMonitoring:
  enabled: true
  serviceMonitor:
    enabled: true
    interval: 30s
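
A production rollout sketch follows. Note that Helm does not expand `${SMTP_PASSWORD}` / `${SLACK_WEBHOOK_URL}` inside values files, so the `envsubst` substitution step is an assumption about how those secrets get injected:

```bash
# Render secrets into the values file before installing (envsubst step is an assumption)
export SMTP_PASSWORD='...' SLACK_WEBHOOK_URL='...'
envsubst < infrastructure/helm/signoz-values-prod.yaml > /tmp/signoz-values-prod.yaml
helm upgrade --install signoz signoz/signoz -n signoz --create-namespace \
  -f /tmp/signoz-values-prod.yaml
```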

@@ -4,7 +4,7 @@ This directory contains Kubernetes manifests for deploying the Bakery IA platfor

## Quick Start

Deploy the entire platform with these 5 commands:
Deploy the entire platform with these 4 commands:

```bash
# 1. Start Colima with adequate resources
@@ -17,15 +17,14 @@ kind create cluster --config kind-config.yaml
kubectl apply -f https://raw.githubusercontent.com/kubernetes/ingress-nginx/main/deploy/static/provider/kind/deploy.yaml
kubectl wait --namespace ingress-nginx --for=condition=ready pod --selector=app.kubernetes.io/component=controller --timeout=300s

# 4. Configure permanent localhost access
kubectl patch svc ingress-nginx-controller -n ingress-nginx -p '{"spec":{"type":"NodePort","ports":[{"name":"http","port":80,"targetPort":"http","nodePort":30080},{"name":"https","port":443,"targetPort":"https","nodePort":30443}]}}'
# 4. Deploy with Tilt
tilt up

# 5. Deploy with Skaffold
skaffold dev --profile=dev

# 🎉 Access at: https://localhost
# 🎉 Access at: http://localhost (or see Tilt for individual service ports)
```

> **Note**: The kind-config.yaml already configures port mappings (30080→80, 30443→443) for localhost access, so no additional service patching is needed. The NGINX Ingress for Kind uses NodePort by default on those exact ports.
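
Since no service patching is involved anymore, reachability is easy to sanity-check; a sketch, assuming the kind cluster and ingress controller are up (a 404 from nginx before anything is deployed is expected):

```bash
# Confirm the kind port mappings reach the NGINX ingress controller
curl -si http://localhost/ | head -n 1
curl -sik https://localhost/ | head -n 1
```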

## Prerequisites

Install the following tools on macOS:
@@ -100,11 +99,11 @@ Then access via:

### Start Development Environment
```bash
# Start development mode with hot-reload
skaffold dev --profile=dev
# Start development mode with hot-reload using Tilt
tilt up

# Or one-time deployment
skaffold run --profile=dev
# Or start in background
tilt up --stream
```

### Key Features
@@ -246,13 +245,39 @@ colima stop --profile k8s-local

### Restart Sequence
```bash
# Post-restart startup
# Post-restart startup (or use the kubernetes_restart.sh script)
colima start --cpu 6 --memory 12 --disk 120 --runtime docker --profile k8s-local
kind create cluster --config kind-config.yaml
skaffold dev --profile=dev
kubectl apply -f https://raw.githubusercontent.com/kubernetes/ingress-nginx/main/deploy/static/provider/kind/deploy.yaml
kubectl wait --namespace ingress-nginx --for=condition=ready pod --selector=app.kubernetes.io/component=controller --timeout=300s
tilt up
```

## Production Considerations
## Production Deployment

### Production URLs

The production environment uses the following domains:

- **Main Application**: https://bakewise.ai
  - Frontend application and all public pages
  - API endpoints: https://bakewise.ai/api/v1/...

- **Monitoring Stack**: https://monitoring.bakewise.ai
  - Grafana: https://monitoring.bakewise.ai/grafana
  - Prometheus: https://monitoring.bakewise.ai/prometheus
  - Jaeger: https://monitoring.bakewise.ai/jaeger
  - AlertManager: https://monitoring.bakewise.ai/alertmanager

### Production Configuration

The production overlay (`overlays/prod/`) includes:
- **Domain Configuration**: bakewise.ai with Let's Encrypt certificates
- **High Availability**: Multi-replica deployments (2-3 replicas per service)
- **Enhanced Security**: Rate limiting, CORS restrictions, security headers
- **Monitoring**: Full observability stack with Prometheus, Grafana, Jaeger

### Production Considerations

For production deployment:

@@ -263,6 +288,7 @@ For production deployment:
- **External Secrets**: Use managed secret services
- **TLS**: Production Let's Encrypt certificates
- **CI/CD**: Automated deployment pipelines
- **DNS**: Configure DNS A/CNAME records pointing to your cluster's load balancer (see the check below)
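
A quick way to confirm the records resolve before pointing traffic at the cluster; a sketch, assuming `dig` is available and the load-balancer address is already known:

```bash
# Verify DNS points at the cluster's load balancer (returned IPs are environment-specific)
dig +short bakewise.ai
dig +short monitoring.bakewise.ai
curl -sI https://bakewise.ai | head -n 1
```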

## Next Steps

@@ -48,6 +48,9 @@ spec:
                name: pos-integration-secrets
            - secretRef:
                name: whatsapp-secrets
          env:
            - name: OTEL_EXPORTER_OTLP_ENDPOINT
              value: "http://otel-collector.monitoring.svc.cluster.local:4317"
          resources:
            requests:
              memory: "256Mi"
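
To confirm the endpoint is wired through at runtime, a sketch follows; the deployment name is a placeholder for whichever service carries this patch:

```bash
# Check the injected OTLP endpoint inside a running pod (deployment name is a placeholder)
kubectl exec -n bakery-ia deploy/<service-name> -- printenv OTEL_EXPORTER_OTLP_ENDPOINT
```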

@@ -1,429 +0,0 @@
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-alert-rules
  namespace: monitoring
data:
  alert-rules.yml: |
    groups:
      # Basic Infrastructure Alerts
      - name: bakery_services
        interval: 30s
        rules:
          - alert: ServiceDown
            expr: up{job="bakery-services"} == 0
            for: 2m
            labels:
              severity: critical
              component: infrastructure
            annotations:
              summary: "Service {{ $labels.service }} is down"
              description: "Service {{ $labels.service }} in namespace {{ $labels.namespace }} has been down for more than 2 minutes."
              runbook_url: "https://runbooks.bakery-ia.local/ServiceDown"

          - alert: HighErrorRate
            expr: |
              (
                sum(rate(http_requests_total{status_code=~"5..", job="bakery-services"}[5m])) by (service)
                /
                sum(rate(http_requests_total{job="bakery-services"}[5m])) by (service)
              ) > 0.10
            for: 5m
            labels:
              severity: critical
              component: application
            annotations:
              summary: "High error rate on {{ $labels.service }}"
              description: "Service {{ $labels.service }} has error rate above 10% (current: {{ $value | humanizePercentage }})."
              runbook_url: "https://runbooks.bakery-ia.local/HighErrorRate"

          - alert: HighResponseTime
            expr: |
              histogram_quantile(0.95,
                sum(rate(http_request_duration_seconds_bucket{job="bakery-services"}[5m])) by (service, le)
              ) > 1
            for: 5m
            labels:
              severity: warning
              component: performance
            annotations:
              summary: "High response time on {{ $labels.service }}"
              description: "Service {{ $labels.service }} P95 latency is above 1 second (current: {{ $value }}s)."
              runbook_url: "https://runbooks.bakery-ia.local/HighResponseTime"

          - alert: HighMemoryUsage
            expr: |
              container_memory_usage_bytes{namespace="bakery-ia", container!=""} > 500000000
            for: 5m
            labels:
              severity: warning
              component: infrastructure
            annotations:
              summary: "High memory usage in {{ $labels.pod }}"
              description: "Container {{ $labels.container }} in pod {{ $labels.pod }} is using more than 500MB of memory (current: {{ $value | humanize }}B)."
              runbook_url: "https://runbooks.bakery-ia.local/HighMemoryUsage"

          - alert: DatabaseConnectionHigh
            expr: |
              pg_stat_database_numbackends{datname="bakery"} > 80
            for: 5m
            labels:
              severity: warning
              component: database
            annotations:
              summary: "High database connection count"
              description: "Database has more than 80 active connections (current: {{ $value }})."
              runbook_url: "https://runbooks.bakery-ia.local/DatabaseConnectionHigh"

      # Business Logic Alerts
      - name: bakery_business
        interval: 30s
        rules:
          - alert: TrainingJobFailed
            expr: |
              increase(training_job_failures_total[1h]) > 0
            for: 5m
            labels:
              severity: warning
              component: ml-training
            annotations:
              summary: "Training job failures detected"
              description: "{{ $value }} training job(s) failed in the last hour."
              runbook_url: "https://runbooks.bakery-ia.local/TrainingJobFailed"

          - alert: LowPredictionAccuracy
            expr: |
              prediction_model_accuracy < 0.70
            for: 15m
            labels:
              severity: warning
              component: ml-inference
            annotations:
              summary: "Model prediction accuracy is low"
              description: "Model {{ $labels.model_name }} accuracy is below 70% (current: {{ $value | humanizePercentage }})."
              runbook_url: "https://runbooks.bakery-ia.local/LowPredictionAccuracy"

          - alert: APIRateLimitHit
            expr: |
              increase(rate_limit_hits_total[5m]) > 10
            for: 5m
            labels:
              severity: info
              component: api-gateway
            annotations:
              summary: "API rate limits being hit frequently"
              description: "Rate limits hit {{ $value }} times in the last 5 minutes."
              runbook_url: "https://runbooks.bakery-ia.local/APIRateLimitHit"

      # Alert System Health
      - name: alert_system_health
        interval: 30s
        rules:
          - alert: AlertSystemComponentDown
            expr: |
              alert_system_component_health{component=~"processor|notifier|scheduler"} == 0
            for: 2m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "Alert system component {{ $labels.component }} is unhealthy"
              description: "Component {{ $labels.component }} has been unhealthy for more than 2 minutes."
              runbook_url: "https://runbooks.bakery-ia.local/AlertSystemComponentDown"

          - alert: RabbitMQConnectionDown
            expr: |
              rabbitmq_up == 0
            for: 1m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "RabbitMQ connection is down"
              description: "Alert system has lost connection to RabbitMQ message queue."
              runbook_url: "https://runbooks.bakery-ia.local/RabbitMQConnectionDown"

          - alert: RedisConnectionDown
            expr: |
              redis_up == 0
            for: 1m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "Redis connection is down"
              description: "Alert system has lost connection to Redis cache."
              runbook_url: "https://runbooks.bakery-ia.local/RedisConnectionDown"

          - alert: NoSchedulerLeader
            expr: |
              sum(alert_system_scheduler_leader) == 0
            for: 5m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "No alert scheduler leader elected"
              description: "No scheduler instance has been elected as leader for 5 minutes."
              runbook_url: "https://runbooks.bakery-ia.local/NoSchedulerLeader"

      # Alert System Performance
      - name: alert_system_performance
        interval: 30s
        rules:
          - alert: HighAlertProcessingErrorRate
            expr: |
              (
                sum(rate(alert_processing_errors_total[2m]))
                /
                sum(rate(alerts_processed_total[2m]))
              ) > 0.10
            for: 2m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "High alert processing error rate"
              description: "Alert processing error rate is above 10% (current: {{ $value | humanizePercentage }})."
              runbook_url: "https://runbooks.bakery-ia.local/HighAlertProcessingErrorRate"

          - alert: HighNotificationDeliveryFailureRate
            expr: |
              (
                sum(rate(notification_delivery_failures_total[3m]))
                /
                sum(rate(notifications_sent_total[3m]))
              ) > 0.05
            for: 3m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "High notification delivery failure rate"
              description: "Notification delivery failure rate is above 5% (current: {{ $value | humanizePercentage }})."
              runbook_url: "https://runbooks.bakery-ia.local/HighNotificationDeliveryFailureRate"

          - alert: HighAlertProcessingLatency
            expr: |
              histogram_quantile(0.95,
                sum(rate(alert_processing_duration_seconds_bucket[5m])) by (le)
              ) > 5
            for: 5m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "High alert processing latency"
              description: "P95 alert processing latency is above 5 seconds (current: {{ $value }}s)."
              runbook_url: "https://runbooks.bakery-ia.local/HighAlertProcessingLatency"

          - alert: TooManySSEConnections
            expr: |
              sse_active_connections > 1000
            for: 2m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "Too many active SSE connections"
              description: "More than 1000 active SSE connections (current: {{ $value }})."
              runbook_url: "https://runbooks.bakery-ia.local/TooManySSEConnections"

          - alert: SSEConnectionErrors
            expr: |
              rate(sse_connection_errors_total[3m]) > 0.5
            for: 3m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "High rate of SSE connection errors"
              description: "SSE connection error rate is {{ $value }} errors/sec."
              runbook_url: "https://runbooks.bakery-ia.local/SSEConnectionErrors"

      # Alert System Business Logic
      - name: alert_system_business
        interval: 30s
        rules:
          - alert: UnusuallyHighAlertVolume
            expr: |
              rate(alerts_generated_total[5m]) > 2
            for: 5m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "Unusually high alert generation volume"
              description: "More than 2 alerts per second being generated (current: {{ $value }}/sec)."
              runbook_url: "https://runbooks.bakery-ia.local/UnusuallyHighAlertVolume"

          - alert: NoAlertsGenerated
            expr: |
              rate(alerts_generated_total[30m]) == 0
            for: 15m
            labels:
              severity: info
              component: alert-system
            annotations:
              summary: "No alerts generated recently"
              description: "No alerts have been generated in the last 30 minutes. This might indicate a problem with alert detection."
              runbook_url: "https://runbooks.bakery-ia.local/NoAlertsGenerated"

          - alert: SlowAlertResponseTime
            expr: |
              histogram_quantile(0.95,
                sum(rate(alert_response_time_seconds_bucket[10m])) by (le)
              ) > 3600
            for: 10m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "Slow alert response times"
              description: "P95 alert response time is above 1 hour (current: {{ $value | humanizeDuration }})."
              runbook_url: "https://runbooks.bakery-ia.local/SlowAlertResponseTime"

          - alert: CriticalAlertsUnacknowledged
            expr: |
              sum(alerts_unacknowledged{severity="critical"}) > 5
            for: 10m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "Multiple critical alerts unacknowledged"
              description: "{{ $value }} critical alerts have not been acknowledged for 10+ minutes."
              runbook_url: "https://runbooks.bakery-ia.local/CriticalAlertsUnacknowledged"

      # Alert System Capacity
      - name: alert_system_capacity
        interval: 30s
        rules:
          - alert: LargeSSEMessageQueues
            expr: |
              sse_message_queue_size > 100
            for: 5m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "Large SSE message queues detected"
              description: "SSE message queue for tenant {{ $labels.tenant_id }} has {{ $value }} messages queued."
              runbook_url: "https://runbooks.bakery-ia.local/LargeSSEMessageQueues"

          - alert: SlowDatabaseStorage
            expr: |
              histogram_quantile(0.95,
                sum(rate(alert_storage_duration_seconds_bucket[5m])) by (le)
              ) > 1
            for: 5m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "Slow alert database storage"
              description: "P95 alert storage latency is above 1 second (current: {{ $value }}s)."
              runbook_url: "https://runbooks.bakery-ia.local/SlowDatabaseStorage"

      # Alert System Critical Scenarios
      - name: alert_system_critical
        interval: 15s
        rules:
          - alert: AlertSystemDown
            expr: |
              up{service=~"alert-processor|notification-service"} == 0
            for: 1m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "Alert system is completely down"
              description: "Core alert system service {{ $labels.service }} is down."
              runbook_url: "https://runbooks.bakery-ia.local/AlertSystemDown"

          - alert: AlertDataNotPersisted
            expr: |
              (
                sum(rate(alerts_processed_total[2m]))
                -
                sum(rate(alerts_stored_total[2m]))
              ) > 0
            for: 2m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "Alerts not being persisted to database"
              description: "Alerts are being processed but not stored in the database."
              runbook_url: "https://runbooks.bakery-ia.local/AlertDataNotPersisted"

          - alert: NotificationsNotDelivered
            expr: |
              (
                sum(rate(alerts_processed_total[3m]))
                -
                sum(rate(notifications_sent_total[3m]))
              ) > 0
            for: 3m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "Notifications not being delivered"
              description: "Alerts are being processed but notifications are not being sent."
              runbook_url: "https://runbooks.bakery-ia.local/NotificationsNotDelivered"

      # Monitoring System Self-Monitoring
      - name: monitoring_health
        interval: 30s
        rules:
          - alert: PrometheusDown
            expr: up{job="prometheus"} == 0
            for: 5m
            labels:
              severity: critical
              component: monitoring
            annotations:
              summary: "Prometheus is down"
              description: "Prometheus monitoring system is not responding."
              runbook_url: "https://runbooks.bakery-ia.local/PrometheusDown"

          - alert: AlertManagerDown
            expr: up{job="alertmanager"} == 0
            for: 2m
            labels:
              severity: critical
              component: monitoring
            annotations:
              summary: "AlertManager is down"
              description: "AlertManager is not responding. Alerts will not be routed."
              runbook_url: "https://runbooks.bakery-ia.local/AlertManagerDown"

          - alert: PrometheusStorageFull
            expr: |
              (
                prometheus_tsdb_storage_blocks_bytes
                /
                (prometheus_tsdb_storage_blocks_bytes + prometheus_tsdb_wal_size_bytes)
              ) > 0.90
            for: 10m
            labels:
              severity: warning
              component: monitoring
            annotations:
              summary: "Prometheus storage almost full"
              description: "Prometheus storage is {{ $value | humanizePercentage }} full."
              runbook_url: "https://runbooks.bakery-ia.local/PrometheusStorageFull"

          - alert: PrometheusScrapeErrors
            expr: |
              rate(prometheus_target_scrapes_exceeded_sample_limit_total[5m]) > 0
            for: 5m
            labels:
              severity: warning
              component: monitoring
            annotations:
              summary: "Prometheus scrape errors detected"
              description: "Prometheus is experiencing scrape errors for target {{ $labels.job }}."
              runbook_url: "https://runbooks.bakery-ia.local/PrometheusScrapeErrors"

@@ -1,27 +0,0 @@
---
# InitContainer to substitute secrets into AlertManager config
# This allows us to use environment variables from secrets in the config file
apiVersion: v1
kind: ConfigMap
metadata:
  name: alertmanager-init-script
  namespace: monitoring
data:
  init-config.sh: |
    #!/bin/sh
    set -e

    # Read the template config
    TEMPLATE=$(cat /etc/alertmanager-template/alertmanager.yml)

    # Substitute environment variables
    echo "$TEMPLATE" | \
      sed "s|{{ .smtp_host }}|${SMTP_HOST}|g" | \
      sed "s|{{ .smtp_from }}|${SMTP_FROM}|g" | \
      sed "s|{{ .smtp_username }}|${SMTP_USERNAME}|g" | \
      sed "s|{{ .smtp_password }}|${SMTP_PASSWORD}|g" | \
      sed "s|{{ .slack_webhook_url }}|${SLACK_WEBHOOK_URL}|g" \
      > /etc/alertmanager-final/alertmanager.yml

    echo "AlertManager config initialized successfully"
    cat /etc/alertmanager-final/alertmanager.yml
@@ -1,391 +0,0 @@
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: alertmanager-config
|
||||
namespace: monitoring
|
||||
data:
|
||||
alertmanager.yml: |
|
||||
global:
|
||||
resolve_timeout: 5m
|
||||
smtp_smarthost: '{{ .smtp_host }}'
|
||||
smtp_from: '{{ .smtp_from }}'
|
||||
smtp_auth_username: '{{ .smtp_username }}'
|
||||
smtp_auth_password: '{{ .smtp_password }}'
|
||||
smtp_require_tls: true
|
||||
|
||||
# Define notification templates
|
||||
templates:
|
||||
- '/etc/alertmanager/templates/*.tmpl'
|
||||
|
||||
# Route alerts to appropriate receivers
|
||||
route:
|
||||
# Default receiver
|
||||
receiver: 'default-email'
|
||||
# Group alerts by these labels
|
||||
group_by: ['alertname', 'cluster', 'service']
|
||||
# Wait time before sending initial notification
|
||||
group_wait: 10s
|
||||
# Wait time before sending notifications about new alerts in the group
|
||||
group_interval: 10s
|
||||
# Wait time before re-sending a notification
|
||||
repeat_interval: 12h
|
||||
|
||||
# Child routes for specific alert routing
|
||||
routes:
|
||||
# Critical alerts - send immediately to all channels
|
||||
- match:
|
||||
severity: critical
|
||||
receiver: 'critical-alerts'
|
||||
group_wait: 0s
|
||||
group_interval: 5m
|
||||
repeat_interval: 4h
|
||||
continue: true
|
||||
|
||||
# Warning alerts - less urgent
|
||||
- match:
|
||||
severity: warning
|
||||
receiver: 'warning-alerts'
|
||||
group_wait: 30s
|
||||
group_interval: 5m
|
||||
repeat_interval: 12h
|
||||
|
||||
# Alert system specific alerts
|
||||
- match:
|
||||
component: alert-system
|
||||
receiver: 'alert-system-team'
|
||||
group_wait: 10s
|
||||
repeat_interval: 6h
|
||||
|
||||
# Database alerts
|
||||
- match_re:
|
||||
alertname: ^(DatabaseConnectionHigh|SlowDatabaseStorage)$
|
||||
receiver: 'database-team'
|
||||
group_wait: 30s
|
||||
repeat_interval: 8h
|
||||
|
||||
# Infrastructure alerts
|
||||
- match_re:
|
||||
alertname: ^(HighMemoryUsage|ServiceDown)$
|
||||
receiver: 'infra-team'
|
||||
group_wait: 30s
|
||||
repeat_interval: 6h
|
||||
|
||||
# Inhibition rules - prevent alert spam
|
||||
inhibit_rules:
|
||||
# If service is down, inhibit all other alerts for that service
|
||||
- source_match:
|
||||
alertname: 'ServiceDown'
|
||||
target_match_re:
|
||||
alertname: '(HighErrorRate|HighResponseTime|HighMemoryUsage)'
|
||||
equal: ['service']
|
||||
|
||||
# If AlertSystem is completely down, inhibit component alerts
|
||||
- source_match:
|
||||
alertname: 'AlertSystemDown'
|
||||
target_match_re:
|
||||
alertname: 'AlertSystemComponent.*'
|
||||
equal: ['namespace']
|
||||
|
||||
# If RabbitMQ is down, inhibit alert processing errors
|
||||
- source_match:
|
||||
alertname: 'RabbitMQConnectionDown'
|
||||
target_match:
|
||||
alertname: 'HighAlertProcessingErrorRate'
|
||||
equal: ['namespace']
|
||||
|
||||
# Receivers - notification destinations
|
||||
receivers:
|
||||
# Default email receiver
|
||||
- name: 'default-email'
|
||||
email_configs:
|
||||
- to: 'alerts@yourdomain.com'
|
||||
headers:
|
||||
Subject: '[{{ .Status | toUpper }}] {{ .GroupLabels.alertname }} - {{ .GroupLabels.service }}'
|
||||
html: |
|
||||
{{ range .Alerts }}
|
||||
<h2>{{ .Labels.alertname }}</h2>
|
||||
<p><strong>Status:</strong> {{ .Status }}</p>
|
||||
<p><strong>Severity:</strong> {{ .Labels.severity }}</p>
|
||||
<p><strong>Service:</strong> {{ .Labels.service }}</p>
|
||||
<p><strong>Summary:</strong> {{ .Annotations.summary }}</p>
|
||||
<p><strong>Description:</strong> {{ .Annotations.description }}</p>
|
||||
<p><strong>Started:</strong> {{ .StartsAt }}</p>
|
||||
{{ if .EndsAt }}<p><strong>Ended:</strong> {{ .EndsAt }}</p>{{ end }}
|
||||
{{ end }}
|
||||
|
||||
# Critical alerts - multiple channels
|
||||
- name: 'critical-alerts'
|
||||
email_configs:
|
||||
- to: 'critical-alerts@yourdomain.com,oncall@yourdomain.com'
|
||||
headers:
|
||||
Subject: '🚨 [CRITICAL] {{ .GroupLabels.alertname }} - {{ .GroupLabels.service }}'
|
||||
send_resolved: true
|
||||
# Uncomment to enable Slack notifications
|
||||
# slack_configs:
|
||||
# - api_url: '{{ .slack_webhook_url }}'
|
||||
# channel: '#alerts-critical'
|
||||
# title: '🚨 Critical Alert'
|
||||
# text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
|
||||
# send_resolved: true
|
||||
|
||||
# Warning alerts
|
||||
- name: 'warning-alerts'
|
||||
email_configs:
|
||||
- to: 'alerts@yourdomain.com'
|
||||
headers:
|
||||
Subject: '⚠️ [WARNING] {{ .GroupLabels.alertname }} - {{ .GroupLabels.service }}'
|
||||
send_resolved: true
|
||||
|
||||
# Alert system team
|
||||
- name: 'alert-system-team'
|
||||
email_configs:
|
||||
- to: 'alert-system-team@yourdomain.com'
|
||||
headers:
|
||||
Subject: '[Alert System] {{ .GroupLabels.alertname }}'
|
||||
send_resolved: true
|
||||
|
||||
# Database team
|
||||
- name: 'database-team'
|
||||
email_configs:
|
||||
- to: 'database-team@yourdomain.com'
|
||||
headers:
|
||||
Subject: '[Database] {{ .GroupLabels.alertname }}'
|
||||
send_resolved: true
|
||||
|
||||
# Infrastructure team
|
||||
- name: 'infra-team'
|
||||
email_configs:
|
||||
- to: 'infra-team@yourdomain.com'
|
||||
headers:
|
||||
Subject: '[Infrastructure] {{ .GroupLabels.alertname }}'
|
||||
send_resolved: true
|
||||
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: alertmanager-templates
|
||||
namespace: monitoring
|
||||
data:
|
||||
default.tmpl: |
|
||||
{{ define "cluster" }}{{ .ExternalURL | reReplaceAll ".*alertmanager\\.(.*)" "$1" }}{{ end }}
|
||||
|
||||
{{ define "slack.default.title" }}
|
||||
[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.alertname }}
|
||||
{{ end }}
|
||||
|
||||
{{ define "slack.default.text" }}
|
||||
{{ range .Alerts }}
|
||||
*Alert:* {{ .Annotations.summary }}
|
||||
*Description:* {{ .Annotations.description }}
|
||||
*Severity:* `{{ .Labels.severity }}`
|
||||
*Service:* `{{ .Labels.service }}`
|
||||
{{ end }}
|
||||
{{ end }}
|
||||
|
||||
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: alertmanager
  namespace: monitoring
  labels:
    app: alertmanager
spec:
  serviceName: alertmanager
  replicas: 3
  selector:
    matchLabels:
      app: alertmanager
  template:
    metadata:
      labels:
        app: alertmanager
    spec:
      serviceAccountName: prometheus
      initContainers:
        - name: init-config
          image: busybox:1.36
          command: ['/bin/sh', '/scripts/init-config.sh']
          env:
            - name: SMTP_HOST
              valueFrom:
                secretKeyRef:
                  name: alertmanager-secrets
                  key: smtp-host
            - name: SMTP_USERNAME
              valueFrom:
                secretKeyRef:
                  name: alertmanager-secrets
                  key: smtp-username
            - name: SMTP_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: alertmanager-secrets
                  key: smtp-password
            - name: SMTP_FROM
              valueFrom:
                secretKeyRef:
                  name: alertmanager-secrets
                  key: smtp-from
            - name: SLACK_WEBHOOK_URL
              valueFrom:
                secretKeyRef:
                  name: alertmanager-secrets
                  key: slack-webhook-url
                  optional: true
          volumeMounts:
            - name: init-script
              mountPath: /scripts
            - name: config-template
              mountPath: /etc/alertmanager-template
            - name: config-final
              mountPath: /etc/alertmanager-final
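      # The init script comes from the alertmanager-init-script ConfigMap
      # (defined in alertmanager-init.yaml, which is not shown in this diff).
      # A minimal sketch of what it presumably does — substitute the env vars
      # above into the config template; the actual script may differ:
      #
      #   #!/bin/sh
      #   cp /etc/alertmanager-template/alertmanager.yml /etc/alertmanager-final/alertmanager.yml
      #   for v in SMTP_HOST SMTP_USERNAME SMTP_PASSWORD SMTP_FROM SLACK_WEBHOOK_URL; do
      #     eval val=\"\$$v\"
      #     sed -i "s|\${$v}|$val|g" /etc/alertmanager-final/alertmanager.yml
      #   done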
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values:
                        - alertmanager
                topologyKey: kubernetes.io/hostname
      containers:
        - name: alertmanager
          image: prom/alertmanager:v0.27.0
          args:
            - '--config.file=/etc/alertmanager/alertmanager.yml'
            - '--storage.path=/alertmanager'
            - '--cluster.listen-address=0.0.0.0:9094'
            - '--cluster.peer=alertmanager-0.alertmanager.monitoring.svc.cluster.local:9094'
            - '--cluster.peer=alertmanager-1.alertmanager.monitoring.svc.cluster.local:9094'
            - '--cluster.peer=alertmanager-2.alertmanager.monitoring.svc.cluster.local:9094'
            - '--cluster.reconnect-timeout=5m'
            - '--web.external-url=http://monitoring.bakery-ia.local/alertmanager'
            - '--web.route-prefix=/'
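          # The per-pod peer addresses above resolve through the headless
          # Service (clusterIP: None) named "alertmanager" defined further
          # down; that Service is what gives each StatefulSet pod its stable
          # <pod>.<service>.<namespace>.svc.cluster.local DNS name.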
          ports:
            - name: web
              containerPort: 9093
            - name: mesh-tcp
              containerPort: 9094
            - name: mesh-udp
              containerPort: 9094
              protocol: UDP
          env:
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
          volumeMounts:
            - name: config-final
              mountPath: /etc/alertmanager
            - name: templates
              mountPath: /etc/alertmanager/templates
            - name: storage
              mountPath: /alertmanager
          resources:
            requests:
              memory: "128Mi"
              cpu: "100m"
            limits:
              memory: "256Mi"
              cpu: "500m"
          livenessProbe:
            httpGet:
              path: /-/healthy
              port: 9093
            initialDelaySeconds: 30
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /-/ready
              port: 9093
            initialDelaySeconds: 5
            periodSeconds: 5

        # Config reloader sidecar
        - name: configmap-reload
          image: jimmidyson/configmap-reload:v0.12.0
          args:
            - '--webhook-url=http://localhost:9093/-/reload'
            - '--volume-dir=/etc/alertmanager'
          volumeMounts:
            - name: config-final
              mountPath: /etc/alertmanager
              readOnly: true
          resources:
            requests:
              memory: "16Mi"
              cpu: "10m"
            limits:
              memory: "32Mi"
              cpu: "50m"

      volumes:
        - name: init-script
          configMap:
            name: alertmanager-init-script
            defaultMode: 0755
        - name: config-template
          configMap:
            name: alertmanager-config
        - name: config-final
          emptyDir: {}
        - name: templates
          configMap:
            name: alertmanager-templates

  volumeClaimTemplates:
    - metadata:
        name: storage
      spec:
        accessModes: [ "ReadWriteOnce" ]
        resources:
          requests:
            storage: 2Gi

---
apiVersion: v1
kind: Service
metadata:
  name: alertmanager
  namespace: monitoring
  labels:
    app: alertmanager
spec:
  type: ClusterIP
  clusterIP: None
  ports:
    - name: web
      port: 9093
      targetPort: 9093
    - name: mesh-tcp
      port: 9094
      targetPort: 9094
    - name: mesh-udp
      port: 9094
      targetPort: 9094
      protocol: UDP
  selector:
    app: alertmanager

---
apiVersion: v1
kind: Service
metadata:
  name: alertmanager-external
  namespace: monitoring
  labels:
    app: alertmanager
spec:
  type: ClusterIP
  ports:
    - name: web
      port: 9093
      targetPort: 9093
  selector:
    app: alertmanager

@@ -1,949 +0,0 @@
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboards-extended
  namespace: monitoring
data:
  postgresql-dashboard.json: |
    {
      "dashboard": {
        "title": "Bakery IA - PostgreSQL Database",
        "tags": ["bakery-ia", "postgresql", "database"],
        "timezone": "browser",
        "refresh": "30s",
        "schemaVersion": 16,
        "version": 1,
        "panels": [
          {
            "id": 1,
            "title": "Active Connections by Database",
            "type": "graph",
            "gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "pg_stat_activity_count{state=\"active\"}",
                "legendFormat": "{{datname}} - active"
              },
              {
                "expr": "pg_stat_activity_count{state=\"idle\"}",
                "legendFormat": "{{datname}} - idle"
              },
              {
                "expr": "pg_stat_activity_count{state=\"idle in transaction\"}",
                "legendFormat": "{{datname}} - idle tx"
              }
            ]
          },
          {
            "id": 2,
            "title": "Total Connections",
            "type": "stat",
            "gridPos": {"x": 12, "y": 0, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "sum(pg_stat_activity_count)",
                "legendFormat": "Total connections"
              }
            ]
          },
          {
            "id": 3,
            "title": "Max Connections",
            "type": "stat",
            "gridPos": {"x": 18, "y": 0, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "pg_settings_max_connections",
                "legendFormat": "Max connections"
              }
            ]
          },
          {
            "id": 4,
            "title": "Transaction Rate (Commits vs Rollbacks)",
            "type": "graph",
            "gridPos": {"x": 0, "y": 8, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "rate(pg_stat_database_xact_commit[5m])",
                "legendFormat": "{{datname}} - commits"
              },
              {
                "expr": "rate(pg_stat_database_xact_rollback[5m])",
                "legendFormat": "{{datname}} - rollbacks"
              }
            ]
          },
          {
            "id": 5,
            "title": "Cache Hit Ratio",
            "type": "graph",
            "gridPos": {"x": 12, "y": 8, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "100 * (1 - (sum(rate(pg_stat_io_blocks_read_total[5m])) / (sum(rate(pg_stat_io_blocks_read_total[5m])) + sum(rate(pg_stat_io_blocks_hit_total[5m])))))",
                "legendFormat": "Cache hit ratio %"
              }
            ]
          },
          {
            "id": 6,
            "title": "Slow Queries (> 30s)",
            "type": "table",
            "gridPos": {"x": 0, "y": 16, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "pg_slow_queries > 30000",
                "format": "table",
                "instant": true
              }
            ],
            "transformations": [
              {
                "id": "organize",
                "options": {
                  "excludeByName": {},
                  "indexByName": {},
                  "renameByName": {
                    "query": "Query",
                    "duration_ms": "Duration (ms)",
                    "datname": "Database"
                  }
                }
              }
            ]
          },
          {
            "id": 7,
            "title": "Dead Tuples by Table",
            "type": "graph",
            "gridPos": {"x": 12, "y": 16, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "pg_stat_user_tables_n_dead_tup",
                "legendFormat": "{{schemaname}}.{{relname}}"
              }
            ]
          },
          {
            "id": 8,
            "title": "Table Bloat Estimate",
            "type": "graph",
            "gridPos": {"x": 0, "y": 24, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "100 * (pg_stat_user_tables_n_dead_tup * avg_tuple_size) / (pg_total_relation_size * 8192)",
                "legendFormat": "{{schemaname}}.{{relname}} bloat %"
              }
            ]
          },
          {
            "id": 9,
            "title": "Replication Lag (bytes)",
            "type": "graph",
            "gridPos": {"x": 12, "y": 24, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "pg_replication_lag_bytes",
                "legendFormat": "{{slot_name}} - {{application_name}}"
              }
            ]
          },
          {
            "id": 10,
            "title": "Database Size (GB)",
            "type": "graph",
            "gridPos": {"x": 0, "y": 32, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "pg_database_size_bytes / 1024 / 1024 / 1024",
                "legendFormat": "{{datname}}"
              }
            ]
          },
          {
            "id": 11,
            "title": "Database Size Growth (per hour)",
            "type": "graph",
            "gridPos": {"x": 12, "y": 32, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "rate(pg_database_size_bytes[1h])",
                "legendFormat": "{{datname}} - bytes/hour"
              }
            ]
          },
          {
            "id": 12,
            "title": "Lock Counts by Type",
            "type": "graph",
            "gridPos": {"x": 0, "y": 40, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "pg_locks_count",
                "legendFormat": "{{datname}} - {{locktype}} - {{mode}}"
              }
            ]
          },
          {
            "id": 13,
            "title": "Query Duration (p95)",
            "type": "graph",
            "gridPos": {"x": 12, "y": 40, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "histogram_quantile(0.95, rate(pg_query_duration_seconds_bucket[5m]))",
                "legendFormat": "p95"
              }
            ]
          }
        ]
      }
    }

  node-exporter-dashboard.json: |
    {
      "dashboard": {
        "title": "Bakery IA - Node Exporter Infrastructure",
        "tags": ["bakery-ia", "node-exporter", "infrastructure"],
        "timezone": "browser",
        "refresh": "15s",
        "schemaVersion": 16,
        "version": 1,
        "panels": [
          {
            "id": 1,
            "title": "CPU Usage by Node",
            "type": "graph",
            "gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "100 - (avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
                "legendFormat": "{{instance}}"
              }
            ]
          },
          {
            "id": 2,
            "title": "Average CPU Usage",
            "type": "stat",
            "gridPos": {"x": 12, "y": 0, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
                "legendFormat": "Average CPU %"
              }
            ]
          },
          {
            "id": 3,
            "title": "CPU Load (1m, 5m, 15m)",
            "type": "stat",
            "gridPos": {"x": 18, "y": 0, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "avg(node_load1)",
                "legendFormat": "1m"
              },
              {
                "expr": "avg(node_load5)",
                "legendFormat": "5m"
              },
              {
                "expr": "avg(node_load15)",
                "legendFormat": "15m"
              }
            ]
          },
          {
            "id": 4,
            "title": "Memory Usage by Node",
            "type": "graph",
            "gridPos": {"x": 0, "y": 8, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
                "legendFormat": "{{instance}}"
              }
            ]
          },
          {
            "id": 5,
            "title": "Memory Used (GB)",
            "type": "stat",
            "gridPos": {"x": 12, "y": 8, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / 1024 / 1024 / 1024",
                "legendFormat": "{{instance}}"
              }
            ]
          },
          {
            "id": 6,
            "title": "Memory Available (GB)",
            "type": "stat",
            "gridPos": {"x": 18, "y": 8, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "node_memory_MemAvailable_bytes / 1024 / 1024 / 1024",
                "legendFormat": "{{instance}}"
              }
            ]
          },
          {
            "id": 7,
            "title": "Disk I/O Read Rate (MB/s)",
            "type": "graph",
            "gridPos": {"x": 0, "y": 16, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "rate(node_disk_read_bytes_total[5m]) / 1024 / 1024",
                "legendFormat": "{{instance}} - {{device}}"
              }
            ]
          },
          {
            "id": 8,
            "title": "Disk I/O Write Rate (MB/s)",
            "type": "graph",
            "gridPos": {"x": 12, "y": 16, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "rate(node_disk_written_bytes_total[5m]) / 1024 / 1024",
                "legendFormat": "{{instance}} - {{device}}"
              }
            ]
          },
          {
            "id": 9,
            "title": "Disk I/O Operations (IOPS)",
            "type": "graph",
            "gridPos": {"x": 0, "y": 24, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "rate(node_disk_reads_completed_total[5m]) + rate(node_disk_writes_completed_total[5m])",
                "legendFormat": "{{instance}} - {{device}}"
              }
            ]
          },
          {
            "id": 10,
            "title": "Network Receive Rate (Mbps)",
            "type": "graph",
            "gridPos": {"x": 12, "y": 24, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "rate(node_network_receive_bytes_total{device!=\"lo\"}[5m]) * 8 / 1024 / 1024",
                "legendFormat": "{{instance}} - {{device}}"
              }
            ]
          },
          {
            "id": 11,
            "title": "Network Transmit Rate (Mbps)",
            "type": "graph",
            "gridPos": {"x": 0, "y": 32, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "rate(node_network_transmit_bytes_total{device!=\"lo\"}[5m]) * 8 / 1024 / 1024",
                "legendFormat": "{{instance}} - {{device}}"
              }
            ]
          },
          {
            "id": 12,
            "title": "Network Errors",
            "type": "graph",
            "gridPos": {"x": 12, "y": 32, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "rate(node_network_receive_errs_total[5m]) + rate(node_network_transmit_errs_total[5m])",
                "legendFormat": "{{instance}} - {{device}}"
              }
            ]
          },
          {
            "id": 13,
            "title": "Filesystem Usage by Mount",
            "type": "graph",
            "gridPos": {"x": 0, "y": 40, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "100 * (1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes))",
                "legendFormat": "{{instance}} - {{mountpoint}}"
              }
            ]
          },
          {
            "id": 14,
            "title": "Filesystem Available (GB)",
            "type": "stat",
            "gridPos": {"x": 12, "y": 40, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "node_filesystem_avail_bytes / 1024 / 1024 / 1024",
                "legendFormat": "{{instance}} - {{mountpoint}}"
              }
            ]
          },
          {
            "id": 15,
            "title": "Filesystem Size (GB)",
            "type": "stat",
            "gridPos": {"x": 18, "y": 40, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "node_filesystem_size_bytes / 1024 / 1024 / 1024",
                "legendFormat": "{{instance}} - {{mountpoint}}"
              }
            ]
          },
          {
            "id": 16,
            "title": "Load Average (1m, 5m, 15m)",
            "type": "graph",
            "gridPos": {"x": 0, "y": 48, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "node_load1",
                "legendFormat": "{{instance}} - 1m"
              },
              {
                "expr": "node_load5",
                "legendFormat": "{{instance}} - 5m"
              },
              {
                "expr": "node_load15",
                "legendFormat": "{{instance}} - 15m"
              }
            ]
          },
          {
            "id": 17,
            "title": "System Up Time",
            "type": "stat",
            "gridPos": {"x": 12, "y": 48, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "time() - node_boot_time_seconds",
                "legendFormat": "{{instance}} - uptime"
              }
            ]
          },
          {
            "id": 18,
            "title": "Context Switches",
            "type": "graph",
            "gridPos": {"x": 0, "y": 56, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "rate(node_context_switches_total[5m])",
                "legendFormat": "{{instance}}"
              }
            ]
          },
          {
            "id": 19,
            "title": "Interrupts",
            "type": "graph",
            "gridPos": {"x": 12, "y": 56, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "rate(node_intr_total[5m])",
                "legendFormat": "{{instance}}"
              }
            ]
          }
        ]
      }
    }

  alertmanager-dashboard.json: |
    {
      "dashboard": {
        "title": "Bakery IA - AlertManager Monitoring",
        "tags": ["bakery-ia", "alertmanager", "alerting"],
        "timezone": "browser",
        "refresh": "10s",
        "schemaVersion": 16,
        "version": 1,
        "panels": [
          {
            "id": 1,
            "title": "Active Alerts by Severity",
            "type": "graph",
            "gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "count by (severity) (ALERTS{alertstate=\"firing\"})",
                "legendFormat": "{{severity}}"
              }
            ]
          },
          {
            "id": 2,
            "title": "Total Active Alerts",
            "type": "stat",
            "gridPos": {"x": 12, "y": 0, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "count(ALERTS{alertstate=\"firing\"})",
                "legendFormat": "Active alerts"
              }
            ]
          },
          {
            "id": 3,
            "title": "Critical Alerts",
            "type": "stat",
            "gridPos": {"x": 18, "y": 0, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "count(ALERTS{alertstate=\"firing\", severity=\"critical\"})",
                "legendFormat": "Critical"
              }
            ]
          },
          {
            "id": 4,
            "title": "Alert Firing Rate (per minute)",
            "type": "graph",
            "gridPos": {"x": 0, "y": 8, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "rate(alertmanager_alerts_fired_total[1m])",
                "legendFormat": "Alerts fired/min"
              }
            ]
          },
          {
            "id": 5,
            "title": "Alert Resolution Rate (per minute)",
            "type": "graph",
            "gridPos": {"x": 12, "y": 8, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "rate(alertmanager_alerts_resolved_total[1m])",
                "legendFormat": "Alerts resolved/min"
              }
            ]
          },
          {
            "id": 6,
            "title": "Notification Success Rate",
            "type": "graph",
            "gridPos": {"x": 0, "y": 16, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "100 * (rate(alertmanager_notifications_total{status=\"success\"}[5m]) / rate(alertmanager_notifications_total[5m]))",
                "legendFormat": "Success rate %"
              }
            ]
          },
          {
            "id": 7,
            "title": "Notification Failures",
            "type": "graph",
            "gridPos": {"x": 12, "y": 16, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "rate(alertmanager_notifications_total{status=\"failed\"}[5m])",
                "legendFormat": "{{integration}}"
              }
            ]
          },
          {
            "id": 8,
            "title": "Silenced Alerts",
            "type": "stat",
            "gridPos": {"x": 0, "y": 24, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "count(ALERTS{alertstate=\"silenced\"})",
                "legendFormat": "Silenced"
              }
            ]
          },
          {
            "id": 9,
            "title": "AlertManager Cluster Size",
            "type": "stat",
            "gridPos": {"x": 6, "y": 24, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "count(alertmanager_cluster_peers)",
                "legendFormat": "Cluster peers"
              }
            ]
          },
          {
            "id": 10,
            "title": "AlertManager Peers",
            "type": "stat",
            "gridPos": {"x": 12, "y": 24, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "alertmanager_cluster_peers",
                "legendFormat": "{{instance}}"
              }
            ]
          },
          {
            "id": 11,
            "title": "Cluster Status",
            "type": "stat",
            "gridPos": {"x": 18, "y": 24, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "up{job=\"alertmanager\"}",
                "legendFormat": "{{instance}}"
              }
            ]
          },
          {
            "id": 12,
            "title": "Alerts by Group",
            "type": "table",
            "gridPos": {"x": 0, "y": 28, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "count by (alertname) (ALERTS{alertstate=\"firing\"})",
                "format": "table",
                "instant": true
              }
            ],
            "transformations": [
              {
                "id": "organize",
                "options": {
                  "excludeByName": {},
                  "indexByName": {},
                  "renameByName": {
                    "alertname": "Alert Name",
                    "Value": "Count"
                  }
                }
              }
            ]
          },
          {
            "id": 13,
            "title": "Alert Duration (p99)",
            "type": "graph",
            "gridPos": {"x": 12, "y": 28, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "histogram_quantile(0.99, rate(alertmanager_alert_duration_seconds_bucket[5m]))",
                "legendFormat": "p99 duration"
              }
            ]
          },
          {
            "id": 14,
            "title": "Processing Time",
            "type": "graph",
            "gridPos": {"x": 0, "y": 36, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "rate(alertmanager_receiver_processing_duration_seconds_sum[5m]) / rate(alertmanager_receiver_processing_duration_seconds_count[5m])",
                "legendFormat": "{{receiver}}"
              }
            ]
          },
          {
            "id": 15,
            "title": "Memory Usage",
            "type": "stat",
            "gridPos": {"x": 12, "y": 36, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "process_resident_memory_bytes{job=\"alertmanager\"} / 1024 / 1024",
                "legendFormat": "{{instance}} - MB"
              }
            ]
          }
        ]
      }
    }

  business-metrics-dashboard.json: |
    {
      "dashboard": {
        "title": "Bakery IA - Business Metrics & KPIs",
        "tags": ["bakery-ia", "business-metrics", "kpis"],
        "timezone": "browser",
        "refresh": "30s",
        "schemaVersion": 16,
        "version": 1,
        "panels": [
          {
            "id": 1,
            "title": "Requests per Service (Rate)",
            "type": "graph",
            "gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "sum by (service) (rate(http_requests_total[5m]))",
                "legendFormat": "{{service}}"
              }
            ]
          },
          {
            "id": 2,
            "title": "Total Request Rate",
            "type": "stat",
            "gridPos": {"x": 12, "y": 0, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "sum(rate(http_requests_total[5m]))",
                "legendFormat": "requests/sec"
              }
            ]
          },
          {
            "id": 3,
            "title": "Peak Request Rate (5m)",
            "type": "stat",
            "gridPos": {"x": 18, "y": 0, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "max(sum(rate(http_requests_total[5m])))",
                "legendFormat": "Peak requests/sec"
              }
            ]
          },
          {
            "id": 4,
            "title": "Error Rates by Service",
            "type": "graph",
            "gridPos": {"x": 0, "y": 8, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "sum by (service) (rate(http_requests_total{status_code=~\"5..\"}[5m]))",
                "legendFormat": "{{service}}"
              }
            ]
          },
          {
            "id": 5,
            "title": "Overall Error Rate",
            "type": "stat",
            "gridPos": {"x": 12, "y": 8, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "100 * (sum(rate(http_requests_total{status_code=~\"5..\"}[5m])) / sum(rate(http_requests_total[5m])))",
                "legendFormat": "Error %"
              }
            ]
          },
          {
            "id": 6,
            "title": "4xx Error Rate",
            "type": "stat",
            "gridPos": {"x": 18, "y": 8, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "100 * (sum(rate(http_requests_total{status_code=~\"4..\"}[5m])) / sum(rate(http_requests_total[5m])))",
                "legendFormat": "4xx %"
              }
            ]
          },
          {
            "id": 7,
            "title": "P95 Latency by Service (ms)",
            "type": "graph",
            "gridPos": {"x": 0, "y": 16, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "histogram_quantile(0.95, sum by (service, le) (rate(http_request_duration_seconds_bucket[5m]))) * 1000",
                "legendFormat": "{{service}} p95"
              }
            ]
          },
          {
            "id": 8,
            "title": "P99 Latency by Service (ms)",
            "type": "graph",
            "gridPos": {"x": 12, "y": 16, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "histogram_quantile(0.99, sum by (service, le) (rate(http_request_duration_seconds_bucket[5m]))) * 1000",
                "legendFormat": "{{service}} p99"
              }
            ]
          },
          {
            "id": 9,
            "title": "Average Latency (ms)",
            "type": "stat",
            "gridPos": {"x": 0, "y": 24, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "(sum(rate(http_request_duration_seconds_sum[5m])) / sum(rate(http_request_duration_seconds_count[5m]))) * 1000",
                "legendFormat": "Avg latency ms"
              }
            ]
          },
          {
            "id": 10,
            "title": "Active Tenants",
            "type": "stat",
            "gridPos": {"x": 6, "y": 24, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "count(count by (tenant_id) (rate(http_requests_total[5m])))",
                "legendFormat": "Active tenants"
              }
            ]
          },
          {
            "id": 11,
            "title": "Requests per Tenant",
            "type": "stat",
            "gridPos": {"x": 12, "y": 24, "w": 12, "h": 4},
            "targets": [
              {
                "expr": "sum by (tenant_id) (rate(http_requests_total[5m]))",
                "legendFormat": "Tenant {{tenant_id}}"
              }
            ]
          },
          {
            "id": 12,
            "title": "Alert Generation Rate (per minute)",
            "type": "graph",
            "gridPos": {"x": 0, "y": 32, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "rate(ALERTS_FOR_STATE[1m])",
                "legendFormat": "{{alertname}}"
              }
            ]
          },
          {
            "id": 13,
            "title": "Training Job Success Rate",
            "type": "stat",
            "gridPos": {"x": 12, "y": 32, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "100 * (sum(training_job_completed_total{status=\"success\"}) / sum(training_job_completed_total))",
                "legendFormat": "Success rate %"
              }
            ]
          },
          {
            "id": 14,
            "title": "Training Jobs in Progress",
            "type": "stat",
            "gridPos": {"x": 0, "y": 40, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "count(training_job_in_progress)",
                "legendFormat": "Jobs running"
              }
            ]
          },
          {
            "id": 15,
            "title": "Training Job Completion Time (p95, minutes)",
            "type": "stat",
            "gridPos": {"x": 6, "y": 40, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "histogram_quantile(0.95, rate(training_job_duration_seconds_bucket[5m])) / 60",
                "legendFormat": "p95 minutes"
              }
            ]
          },
          {
            "id": 16,
            "title": "Failed Training Jobs",
            "type": "stat",
            "gridPos": {"x": 12, "y": 40, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "sum(training_job_completed_total{status=\"failed\"})",
                "legendFormat": "Failed jobs"
              }
            ]
          },
          {
            "id": 17,
            "title": "Total Training Jobs Completed",
            "type": "stat",
            "gridPos": {"x": 18, "y": 40, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "sum(training_job_completed_total)",
                "legendFormat": "Total completed"
              }
            ]
          },
          {
            "id": 18,
            "title": "API Health Status",
            "type": "table",
            "gridPos": {"x": 0, "y": 48, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "up{job=\"bakery-services\"}",
                "format": "table",
                "instant": true
              }
            ],
            "transformations": [
              {
                "id": "organize",
                "options": {
                  "excludeByName": {},
                  "indexByName": {},
                  "renameByName": {
                    "service": "Service",
                    "Value": "Status",
                    "instance": "Instance"
                  }
                }
              }
            ]
          },
          {
            "id": 19,
            "title": "Service Success Rate (%)",
            "type": "graph",
            "gridPos": {"x": 12, "y": 48, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "100 * (1 - (sum by (service) (rate(http_requests_total{status_code=~\"5..\"}[5m])) / sum by (service) (rate(http_requests_total[5m]))))",
                "legendFormat": "{{service}}"
              }
            ]
          },
          {
            "id": 20,
            "title": "Requests Processed Today",
            "type": "stat",
            "gridPos": {"x": 0, "y": 56, "w": 12, "h": 4},
            "targets": [
              {
                "expr": "sum(increase(http_requests_total[24h]))",
                "legendFormat": "Requests (24h)"
              }
            ]
          },
          {
            "id": 21,
            "title": "Distinct Users Today",
            "type": "stat",
            "gridPos": {"x": 12, "y": 56, "w": 12, "h": 4},
            "targets": [
              {
                "expr": "count(count by (user_id) (increase(http_requests_total{user_id!=\"\"}[24h])))",
                "legendFormat": "Users (24h)"
              }
            ]
          }
        ]
      }
    }
@@ -1,177 +0,0 @@
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboards
  namespace: monitoring
data:
  gateway-metrics.json: |
    {
      "dashboard": {
        "title": "Bakery IA - Gateway Metrics",
        "tags": ["bakery-ia", "gateway"],
        "timezone": "browser",
        "panels": [
          {
            "id": 1,
            "title": "Request Rate by Endpoint",
            "type": "graph",
            "gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
            "targets": [{
              "expr": "rate(http_requests_total{service=\"gateway\"}[5m])",
              "legendFormat": "{{method}} {{endpoint}}"
            }]
          },
          {
            "id": 2,
            "title": "P95 Request Latency",
            "type": "graph",
            "gridPos": {"x": 12, "y": 0, "w": 12, "h": 8},
            "targets": [{
              "expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{service=\"gateway\"}[5m]))",
              "legendFormat": "{{endpoint}} p95"
            }]
          },
          {
            "id": 3,
            "title": "Error Rate (5xx)",
            "type": "graph",
            "gridPos": {"x": 0, "y": 8, "w": 12, "h": 8},
            "targets": [{
              "expr": "rate(http_requests_total{service=\"gateway\",status_code=~\"5..\"}[5m])",
              "legendFormat": "{{endpoint}} errors"
            }]
          },
          {
            "id": 4,
            "title": "Active Requests",
            "type": "stat",
            "gridPos": {"x": 12, "y": 8, "w": 6, "h": 4},
            "targets": [{
              "expr": "sum(rate(http_requests_total{service=\"gateway\"}[1m]))"
            }]
          },
          {
            "id": 5,
            "title": "Authentication Success Rate",
            "type": "stat",
            "gridPos": {"x": 18, "y": 8, "w": 6, "h": 4},
            "targets": [{
              "expr": "rate(gateway_auth_responses_total[5m]) / rate(gateway_auth_requests_total[5m]) * 100"
            }]
          }
        ],
        "refresh": "10s",
        "schemaVersion": 16,
        "version": 1
      }
    }

  services-overview.json: |
    {
      "dashboard": {
        "title": "Bakery IA - Services Overview",
        "tags": ["bakery-ia", "services"],
        "timezone": "browser",
        "panels": [
          {
            "id": 1,
            "title": "Request Rate by Service",
            "type": "graph",
            "gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
            "targets": [{
              "expr": "sum by (service) (rate(http_requests_total[5m]))",
              "legendFormat": "{{service}}"
            }]
          },
          {
            "id": 2,
            "title": "P99 Latency by Service",
            "type": "graph",
            "gridPos": {"x": 12, "y": 0, "w": 12, "h": 8},
            "targets": [{
              "expr": "histogram_quantile(0.99, sum by (service, le) (rate(http_request_duration_seconds_bucket[5m])))",
              "legendFormat": "{{service}} p99"
            }]
          },
          {
            "id": 3,
            "title": "Error Rate by Service",
            "type": "graph",
            "gridPos": {"x": 0, "y": 8, "w": 24, "h": 8},
            "targets": [{
              "expr": "sum by (service) (rate(http_requests_total{status_code=~\"5..\"}[5m]))",
              "legendFormat": "{{service}}"
            }]
          },
          {
            "id": 4,
            "title": "Service Health Status",
            "type": "table",
            "gridPos": {"x": 0, "y": 16, "w": 24, "h": 8},
            "targets": [{
              "expr": "up{job=\"bakery-services\"}",
              "format": "table",
              "instant": true
            }],
            "transformations": [{
              "id": "organize",
              "options": {
                "excludeByName": {},
                "indexByName": {},
                "renameByName": {
                  "service": "Service Name",
                  "Value": "Status"
                }
              }
            }]
          }
        ],
        "refresh": "30s",
        "schemaVersion": 16,
        "version": 1
      }
    }

  circuit-breakers.json: |
    {
      "dashboard": {
        "title": "Bakery IA - Circuit Breakers",
        "tags": ["bakery-ia", "reliability"],
        "timezone": "browser",
        "panels": [
          {
            "id": 1,
            "title": "Circuit Breaker States",
            "type": "stat",
            "gridPos": {"x": 0, "y": 0, "w": 24, "h": 4},
            "targets": [{
              "expr": "circuit_breaker_state",
              "legendFormat": "{{service}} - {{state}}"
            }]
          },
          {
            "id": 2,
            "title": "Circuit Breaker Trips",
            "type": "graph",
            "gridPos": {"x": 0, "y": 4, "w": 12, "h": 8},
            "targets": [{
              "expr": "rate(circuit_breaker_opened_total[5m])",
              "legendFormat": "{{service}}"
            }]
          },
          {
            "id": 3,
            "title": "Rejected Requests",
            "type": "graph",
            "gridPos": {"x": 12, "y": 4, "w": 12, "h": 8},
            "targets": [{
              "expr": "rate(circuit_breaker_rejected_total[5m])",
              "legendFormat": "{{service}}"
            }]
          }
        ],
        "refresh": "10s",
        "schemaVersion": 16,
        "version": 1
      }
    }
@@ -1,166 +0,0 @@
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-datasources
  namespace: monitoring
data:
  prometheus.yaml: |
    apiVersion: 1
    datasources:
      - name: Prometheus
        type: prometheus
        access: proxy
        url: http://prometheus:9090
        isDefault: true
        editable: false

---
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboards-config
  namespace: monitoring
data:
  dashboards.yaml: |
    apiVersion: 1
    providers:
      - name: 'default'
        orgId: 1
        folder: 'Bakery IA'
        type: file
        disableDeletion: false
        updateIntervalSeconds: 10
        allowUiUpdates: true
        options:
          path: /var/lib/grafana/dashboards
      - name: 'extended'
        orgId: 1
        folder: 'Bakery IA - Extended'
        type: file
        disableDeletion: false
        updateIntervalSeconds: 10
        allowUiUpdates: true
        options:
          path: /var/lib/grafana/dashboards-extended

---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: grafana
  namespace: monitoring
  labels:
    app: grafana
spec:
  replicas: 1
  selector:
    matchLabels:
      app: grafana
  template:
    metadata:
      labels:
        app: grafana
    spec:
      containers:
        - name: grafana
          image: grafana/grafana:12.3.0
          ports:
            - containerPort: 3000
              name: http
          env:
            - name: GF_SECURITY_ADMIN_USER
              valueFrom:
                secretKeyRef:
                  name: grafana-admin
                  key: admin-user
            - name: GF_SECURITY_ADMIN_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: grafana-admin
                  key: admin-password
            - name: GF_SERVER_ROOT_URL
              value: "http://monitoring.bakery-ia.local/grafana"
            - name: GF_SERVER_SERVE_FROM_SUB_PATH
              value: "true"
            - name: GF_AUTH_ANONYMOUS_ENABLED
              value: "false"
            - name: GF_INSTALL_PLUGINS
              value: ""
          volumeMounts:
            - name: grafana-storage
              mountPath: /var/lib/grafana
            - name: grafana-datasources
              mountPath: /etc/grafana/provisioning/datasources
            - name: grafana-dashboards-config
              mountPath: /etc/grafana/provisioning/dashboards
            - name: grafana-dashboards
              mountPath: /var/lib/grafana/dashboards
            - name: grafana-dashboards-extended
              mountPath: /var/lib/grafana/dashboards-extended
          resources:
            requests:
              memory: "256Mi"
              cpu: "100m"
            limits:
              memory: "512Mi"
              cpu: "500m"
          livenessProbe:
            httpGet:
              path: /api/health
              port: 3000
            initialDelaySeconds: 30
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /api/health
              port: 3000
            initialDelaySeconds: 5
            periodSeconds: 5
      volumes:
        - name: grafana-storage
          persistentVolumeClaim:
            claimName: grafana-storage
        - name: grafana-datasources
          configMap:
            name: grafana-datasources
        - name: grafana-dashboards-config
          configMap:
            name: grafana-dashboards-config
        - name: grafana-dashboards
          configMap:
            name: grafana-dashboards
        - name: grafana-dashboards-extended
          configMap:
            name: grafana-dashboards-extended

---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: grafana-storage
  namespace: monitoring
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 5Gi

---
apiVersion: v1
kind: Service
metadata:
  name: grafana
  namespace: monitoring
  labels:
    app: grafana
spec:
  type: ClusterIP
  ports:
    - port: 3000
      targetPort: 3000
      protocol: TCP
      name: http
  selector:
    app: grafana
@@ -1,100 +0,0 @@
---
# PodDisruptionBudgets ensure minimum availability during voluntary disruptions
# (node drains, rolling updates, etc.)

apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  name: prometheus-pdb
  namespace: monitoring
spec:
  minAvailable: 1
  selector:
    matchLabels:
      app: prometheus

---
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  name: alertmanager-pdb
  namespace: monitoring
spec:
  minAvailable: 2
  selector:
    matchLabels:
      app: alertmanager

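# With the alertmanager StatefulSet running replicas: 3, minAvailable: 2
# means a voluntary disruption (node drain, rolling update) can evict at
# most one alertmanager pod at a time, keeping a majority of the gossip
# mesh available.
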
---
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  name: grafana-pdb
  namespace: monitoring
spec:
  minAvailable: 1
  selector:
    matchLabels:
      app: grafana

---
# ResourceQuota limits total resources in monitoring namespace
apiVersion: v1
kind: ResourceQuota
metadata:
  name: monitoring-quota
  namespace: monitoring
spec:
  hard:
    # Compute resources
    requests.cpu: "10"
    requests.memory: "16Gi"
    limits.cpu: "20"
    limits.memory: "32Gi"

    # Storage
    persistentvolumeclaims: "10"
    requests.storage: "100Gi"

    # Object counts
    pods: "50"
    services: "20"
    configmaps: "30"
    secrets: "20"

---
# LimitRange sets default resource limits for pods in monitoring namespace
apiVersion: v1
kind: LimitRange
metadata:
  name: monitoring-limits
  namespace: monitoring
spec:
  limits:
    # Default container limits
    - max:
        cpu: "2"
        memory: "4Gi"
      min:
        cpu: "10m"
        memory: "16Mi"
      default:
        cpu: "500m"
        memory: "512Mi"
      defaultRequest:
        cpu: "100m"
        memory: "128Mi"
      type: Container

    # Pod limits
    - max:
        cpu: "4"
        memory: "8Gi"
      type: Pod

    # PVC limits
    - max:
        storage: "50Gi"
      min:
        storage: "1Gi"
      type: PersistentVolumeClaim
@@ -1,42 +0,0 @@
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: monitoring-ingress
  namespace: monitoring
  annotations:
    nginx.ingress.kubernetes.io/rewrite-target: /$2
    nginx.ingress.kubernetes.io/ssl-redirect: "false"
spec:
  rules:
    - host: monitoring.bakery-ia.local
      http:
        paths:
          - path: /grafana(/|$)(.*)
            pathType: ImplementationSpecific
            backend:
              service:
                name: grafana
                port:
                  number: 3000
          - path: /prometheus(/|$)(.*)
            pathType: ImplementationSpecific
            backend:
              service:
                name: prometheus-external
                port:
                  number: 9090
          - path: /jaeger(/|$)(.*)
            pathType: ImplementationSpecific
            backend:
              service:
                name: jaeger-query
                port:
                  number: 16686
          - path: /alertmanager(/|$)(.*)
            pathType: ImplementationSpecific
            backend:
              service:
                name: alertmanager-external
                port:
                  number: 9093
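# With rewrite-target /$2, the backend receives only the second capture
# group of the matched path: a request for /grafana/api/health reaches the
# grafana service as /api/health, which is why Grafana above is configured
# with GF_SERVER_SERVE_FROM_SUB_PATH and a root URL under /grafana.
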
@@ -1,190 +0,0 @@
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: jaeger
  namespace: monitoring
  labels:
    app: jaeger
spec:
  replicas: 1
  selector:
    matchLabels:
      app: jaeger
  template:
    metadata:
      labels:
        app: jaeger
    spec:
      containers:
        - name: jaeger
          image: jaegertracing/all-in-one:1.51
          env:
            - name: COLLECTOR_ZIPKIN_HOST_PORT
              value: ":9411"
            - name: COLLECTOR_OTLP_ENABLED
              value: "true"
            - name: SPAN_STORAGE_TYPE
              value: "badger"
            - name: BADGER_EPHEMERAL
              value: "false"
            - name: BADGER_DIRECTORY_VALUE
              value: "/badger/data"
            - name: BADGER_DIRECTORY_KEY
              value: "/badger/key"
          ports:
            - containerPort: 5775
              protocol: UDP
              name: zipkin-compact
            - containerPort: 6831
              protocol: UDP
              name: jaeger-compact
            - containerPort: 6832
              protocol: UDP
              name: jaeger-binary
            - containerPort: 5778
              protocol: TCP
              name: config-rest
            - containerPort: 16686
              protocol: TCP
              name: query
            - containerPort: 14250
              protocol: TCP
              name: grpc
            - containerPort: 14268
              protocol: TCP
              name: c-tchan-trft
            - containerPort: 14269
              protocol: TCP
              name: admin-http
            - containerPort: 9411
              protocol: TCP
              name: zipkin
            - containerPort: 4317
              protocol: TCP
              name: otlp-grpc
            - containerPort: 4318
              protocol: TCP
              name: otlp-http
          volumeMounts:
            - name: jaeger-storage
              mountPath: /badger
          resources:
            requests:
              memory: "512Mi"
              cpu: "250m"
            limits:
              memory: "1Gi"
              cpu: "500m"
          livenessProbe:
            httpGet:
              path: /
              port: 14269
            initialDelaySeconds: 30
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /
              port: 14269
            initialDelaySeconds: 5
            periodSeconds: 5
      volumes:
        - name: jaeger-storage
          persistentVolumeClaim:
            claimName: jaeger-storage

---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: jaeger-storage
  namespace: monitoring
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 10Gi

---
apiVersion: v1
kind: Service
metadata:
  name: jaeger-query
  namespace: monitoring
  labels:
    app: jaeger
spec:
  type: ClusterIP
  ports:
    - port: 16686
      targetPort: 16686
      protocol: TCP
      name: query
  selector:
    app: jaeger

---
apiVersion: v1
kind: Service
metadata:
  name: jaeger-collector
  namespace: monitoring
  labels:
    app: jaeger
spec:
  type: ClusterIP
  ports:
    - port: 14268
      targetPort: 14268
      protocol: TCP
      name: c-tchan-trft
    - port: 14250
      targetPort: 14250
      protocol: TCP
      name: grpc
    - port: 9411
      targetPort: 9411
      protocol: TCP
      name: zipkin
    - port: 4317
      targetPort: 4317
      protocol: TCP
      name: otlp-grpc
    - port: 4318
      targetPort: 4318
      protocol: TCP
      name: otlp-http
  selector:
    app: jaeger

---
apiVersion: v1
kind: Service
metadata:
  name: jaeger-agent
  namespace: monitoring
  labels:
    app: jaeger
spec:
  type: ClusterIP
  clusterIP: None
  ports:
    - port: 5775
      targetPort: 5775
      protocol: UDP
      name: zipkin-compact
    - port: 6831
      targetPort: 6831
      protocol: UDP
      name: jaeger-compact
    - port: 6832
      targetPort: 6832
      protocol: UDP
      name: jaeger-binary
    - port: 5778
      targetPort: 5778
      protocol: TCP
      name: config-rest
  selector:
    app: jaeger
@@ -1,18 +1,20 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

# Minimal Monitoring Infrastructure
# SigNoz is now managed via Helm in the 'signoz' namespace
# This kustomization only maintains:
# - Namespace for legacy resources (if needed)
# - Node exporter for infrastructure metrics
# - PostgreSQL exporter for database metrics
# - Optional OTEL collector (can be disabled if using SigNoz's built-in collector)

resources:
  - namespace.yaml
  - secrets.yaml
  - prometheus.yaml
  - alert-rules.yaml
  - alertmanager.yaml
  - alertmanager-init.yaml
  - grafana.yaml
  - grafana-dashboards.yaml
  - grafana-dashboards-extended.yaml
  - postgres-exporter.yaml
  # Exporters for metrics collection
  - node-exporter.yaml
  - jaeger.yaml
  - ha-policies.yaml
  - ingress.yaml
  # Optional: Keep OTEL collector or use SigNoz's built-in one
  # Uncomment if you want a dedicated OTEL collector in monitoring namespace
  # - otel-collector.yaml

@@ -0,0 +1,167 @@
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: otel-collector-config
  namespace: monitoring
data:
  otel-collector-config.yaml: |
    extensions:
      health_check:
        endpoint: 0.0.0.0:13133

    receivers:
      otlp:
        protocols:
          grpc:
            endpoint: 0.0.0.0:4317
          http:
            endpoint: 0.0.0.0:4318

    processors:
      batch:
        timeout: 10s
        send_batch_size: 1024

      # Memory limiter to prevent OOM
      memory_limiter:
        check_interval: 1s
        limit_mib: 512
        spike_limit_mib: 128

    exporters:
      # Export metrics to Prometheus
      prometheus:
        endpoint: "0.0.0.0:8889"
        namespace: otelcol
        const_labels:
          source: otel-collector

      # Export to SigNoz via its OTLP collector in the 'signoz' namespace
      otlp/signoz:
        endpoint: "signoz-otel-collector.signoz.svc.cluster.local:4317"
        tls:
          insecure: true

      # Logging exporter for debugging traces and logs
      logging:
        loglevel: info
        sampling_initial: 5
        sampling_thereafter: 200

    service:
      extensions: [health_check]
      pipelines:
        # Traces pipeline: receive -> process -> export to SigNoz
        traces:
          receivers: [otlp]
          processors: [memory_limiter, batch]
          exporters: [otlp/signoz, logging]

        # Metrics pipeline: receive -> process -> export to both Prometheus and SigNoz
        metrics:
          receivers: [otlp]
          processors: [memory_limiter, batch]
          exporters: [prometheus, otlp/signoz]

        # Logs pipeline: receive -> process -> export to SigNoz
        logs:
          receivers: [otlp]
          processors: [memory_limiter, batch]
          exporters: [otlp/signoz, logging]

---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: otel-collector
  namespace: monitoring
  labels:
    app: otel-collector
spec:
  replicas: 1
  selector:
    matchLabels:
      app: otel-collector
  template:
    metadata:
      labels:
        app: otel-collector
    spec:
      containers:
        - name: otel-collector
          image: otel/opentelemetry-collector-contrib:0.91.0
          args:
            - --config=/conf/otel-collector-config.yaml
          ports:
            - containerPort: 4317
              protocol: TCP
              name: otlp-grpc
            - containerPort: 4318
              protocol: TCP
              name: otlp-http
            - containerPort: 8889
              protocol: TCP
              name: prometheus
            - containerPort: 13133
              protocol: TCP
              name: health-check
          volumeMounts:
            - name: otel-collector-config
              mountPath: /conf
          resources:
            requests:
              memory: "256Mi"
              cpu: "100m"
            limits:
              memory: "512Mi"
              cpu: "500m"
          livenessProbe:
            httpGet:
              path: /
              port: 13133
            initialDelaySeconds: 30
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /
              port: 13133
            initialDelaySeconds: 5
            periodSeconds: 5
      volumes:
        - name: otel-collector-config
          configMap:
            name: otel-collector-config
            items:
              - key: otel-collector-config.yaml
                path: otel-collector-config.yaml

---
apiVersion: v1
kind: Service
metadata:
  name: otel-collector
  namespace: monitoring
  labels:
    app: otel-collector
  annotations:
    prometheus.io/scrape: "true"
    prometheus.io/port: "8889"
    prometheus.io/path: "/metrics"
spec:
  type: ClusterIP
  ports:
    - port: 4317
      targetPort: 4317
      protocol: TCP
      name: otlp-grpc
    - port: 4318
      targetPort: 4318
      protocol: TCP
      name: otlp-http
    - port: 8889
      targetPort: 8889
      protocol: TCP
      name: prometheus
  selector:
    app: otel-collector
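# A workload ships telemetry to this collector through the standard OTLP
# env vars; a minimal container-spec sketch (the values follow the Service
# above, and the variable names are the OpenTelemetry SDK defaults):
#
#   env:
#     - name: OTEL_EXPORTER_OTLP_ENDPOINT
#       value: "http://otel-collector.monitoring.svc.cluster.local:4317"
#     - name: OTEL_EXPORTER_OTLP_PROTOCOL
#       value: "grpc"
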
@@ -1,278 +0,0 @@
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: monitoring

---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
  - apiGroups: [""]
    resources:
      - nodes
      - nodes/proxy
      - services
      - endpoints
      - pods
    verbs: ["get", "list", "watch"]
  - apiGroups:
      - extensions
    resources:
      - ingresses
    verbs: ["get", "list", "watch"]
  - nonResourceURLs: ["/metrics"]
    verbs: ["get"]

---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
  - kind: ServiceAccount
    name: prometheus
    namespace: monitoring

---
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: monitoring
data:
  prometheus.yml: |
    global:
      scrape_interval: 30s
      evaluation_interval: 30s
      external_labels:
        cluster: 'bakery-ia'
        environment: 'production'

    # AlertManager configuration
    alerting:
      alertmanagers:
        - static_configs:
            - targets:
                - alertmanager-0.alertmanager.monitoring.svc.cluster.local:9093
                - alertmanager-1.alertmanager.monitoring.svc.cluster.local:9093
                - alertmanager-2.alertmanager.monitoring.svc.cluster.local:9093

    # Load alert rules
    rule_files:
      - '/etc/prometheus/rules/*.yml'

    scrape_configs:
      # Scrape Prometheus itself
      - job_name: 'prometheus'
        static_configs:
          - targets: ['localhost:9090']

      # Scrape all bakery-ia services
      - job_name: 'bakery-services'
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names:
                - bakery-ia
        relabel_configs:
          # Only scrape pods with metrics port
          - source_labels: [__meta_kubernetes_pod_container_port_name]
            action: keep
            regex: http

          # Add service name label
          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
            target_label: service

          # Add component label
          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component]
            target_label: component

          # Add pod name
          - source_labels: [__meta_kubernetes_pod_name]
            target_label: pod

          # Set metrics path
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)

          # Set scrape port
          - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
            action: replace
            regex: ([^:]+)(?::\d+)?;(\d+)
            replacement: $1:$2
            target_label: __address__
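      # A pod opts into this job by exposing a container port named "http";
      # path and port can then be overridden per pod via annotations — a
      # sketch of the relevant pod metadata (values illustrative):
      #
      #   metadata:
      #     annotations:
      #       prometheus.io/path: "/metrics"
      #       prometheus.io/port: "8000"
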
# Scrape Kubernetes nodes
|
||||
- job_name: 'kubernetes-nodes'
|
||||
kubernetes_sd_configs:
|
||||
- role: node
|
||||
relabel_configs:
|
||||
- action: labelmap
|
||||
regex: __meta_kubernetes_node_label_(.+)
|
||||
- target_label: __address__
|
||||
replacement: kubernetes.default.svc:443
|
||||
- source_labels: [__meta_kubernetes_node_name]
|
||||
regex: (.+)
|
||||
target_label: __metrics_path__
|
||||
replacement: /api/v1/nodes/${1}/proxy/metrics
|
||||
|
||||
# Scrape AlertManager
|
||||
- job_name: 'alertmanager'
|
||||
static_configs:
|
||||
- targets:
|
||||
- alertmanager-0.alertmanager.monitoring.svc.cluster.local:9093
|
||||
- alertmanager-1.alertmanager.monitoring.svc.cluster.local:9093
|
||||
- alertmanager-2.alertmanager.monitoring.svc.cluster.local:9093
|
||||
|
||||
# Scrape PostgreSQL exporter
|
||||
- job_name: 'postgres-exporter'
|
||||
static_configs:
|
||||
- targets: ['postgres-exporter.monitoring.svc.cluster.local:9187']
|
||||
|
||||
# Scrape Node Exporter
|
||||
- job_name: 'node-exporter'
|
||||
kubernetes_sd_configs:
|
||||
- role: node
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
regex: '(.*):10250'
|
||||
replacement: '${1}:9100'
|
||||
target_label: __address__
|
||||
- source_labels: [__meta_kubernetes_node_name]
|
||||
target_label: node
|
||||
|
||||
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: prometheus
  namespace: monitoring
  labels:
    app: prometheus
spec:
  serviceName: prometheus
  replicas: 2
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      serviceAccountName: prometheus
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values:
                        - prometheus
                topologyKey: kubernetes.io/hostname
      containers:
        - name: prometheus
          image: prom/prometheus:v3.0.1
          args:
            - '--config.file=/etc/prometheus/prometheus.yml'
            - '--storage.tsdb.path=/prometheus'
            - '--storage.tsdb.retention.time=30d'
            - '--web.console.libraries=/usr/share/prometheus/console_libraries'
            - '--web.console.templates=/usr/share/prometheus/consoles'
            - '--web.enable-lifecycle'
          ports:
            - containerPort: 9090
              name: web
          volumeMounts:
            - name: prometheus-config
              mountPath: /etc/prometheus
            - name: prometheus-rules
              mountPath: /etc/prometheus/rules
            - name: prometheus-storage
              mountPath: /prometheus
          resources:
            requests:
              memory: "1Gi"
              cpu: "500m"
            limits:
              memory: "2Gi"
              cpu: "1"
          livenessProbe:
            httpGet:
              path: /-/healthy
              port: 9090
            initialDelaySeconds: 30
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /-/ready
              port: 9090
            initialDelaySeconds: 5
            periodSeconds: 5
      volumes:
        - name: prometheus-config
          configMap:
            name: prometheus-config
        - name: prometheus-rules
          configMap:
            name: prometheus-alert-rules

  volumeClaimTemplates:
    - metadata:
        name: prometheus-storage
      spec:
        accessModes: [ "ReadWriteOnce" ]
        resources:
          requests:
            storage: 20Gi
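The liveness and readiness probes above use Prometheus's built-in health endpoints, which can also be checked by hand. A quick sketch, assuming the pod is port-forwarded to localhost:9090:

    from urllib.request import urlopen

    # /-/healthy answers once the process is up; /-/ready once it can serve queries.
    for path in ("/-/healthy", "/-/ready"):
        with urlopen(f"http://localhost:9090{path}") as resp:
            print(path, resp.status)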
---
apiVersion: v1
kind: Service
metadata:
  name: prometheus
  namespace: monitoring
  labels:
    app: prometheus
spec:
  type: ClusterIP
  clusterIP: None
  ports:
    - port: 9090
      targetPort: 9090
      protocol: TCP
      name: web
  selector:
    app: prometheus

---
apiVersion: v1
kind: Service
metadata:
  name: prometheus-external
  namespace: monitoring
  labels:
    app: prometheus
spec:
  type: ClusterIP
  ports:
    - port: 9090
      targetPort: 9090
      protocol: TCP
      name: web
  selector:
    app: prometheus
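The first Service is headless (clusterIP: None), which gives each StatefulSet pod a stable DNS name of the form <pod>.<service>.<namespace>.svc.cluster.local; the alertmanager scrape targets above rely on the same convention. A small sketch of how those names are derived (hypothetical helper, not part of the repo):

    def pod_dns(statefulset: str, service: str, namespace: str, replicas: int) -> list[str]:
        """Stable per-pod DNS names provided by a headless Service."""
        return [
            f"{statefulset}-{i}.{service}.{namespace}.svc.cluster.local"
            for i in range(replicas)
        ]

    print(pod_dns("prometheus", "prometheus", "monitoring", 2))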
@@ -14,9 +14,10 @@ data:
  DEBUG: "false"
  LOG_LEVEL: "INFO"

  # Observability Settings
  # Set to "true" when Jaeger/monitoring stack is deployed
  ENABLE_TRACING: "false"
  # Observability Settings - SigNoz enabled
  ENABLE_TRACING: "true"
  ENABLE_METRICS: "true"
  ENABLE_LOGS: "true"

  # Database initialization settings
  # IMPORTANT: Services NEVER run migrations - they only verify DB is ready
@@ -286,12 +287,11 @@ data:
  LOG_FILE_PATH: "/app/logs"
  LOG_ROTATION_SIZE: "100MB"
  LOG_RETENTION_DAYS: "30"
  PROMETHEUS_ENABLED: "true"
  PROMETHEUS_RETENTION: "200h"
  HEALTH_CHECK_TIMEOUT: "30"
  HEALTH_CHECK_INTERVAL: "30"
  PROMETHEUS_RETENTION_DAYS: "30"
  GRAFANA_ROOT_URL: "http://monitoring.bakery-ia.local/grafana"

  # Monitoring Configuration - SigNoz
  SIGNOZ_ROOT_URL: "http://localhost/signoz"

  # ================================================================
  # DATA COLLECTION SETTINGS
@@ -382,16 +382,20 @@ data:
  NOMINATIM_CPU_LIMIT: "4"

  # ================================================================
  # DISTRIBUTED TRACING (Jaeger/OpenTelemetry)
  # OBSERVABILITY - SigNoz (Unified Monitoring)
  # ================================================================
  JAEGER_COLLECTOR_ENDPOINT: "http://jaeger-collector.monitoring:4317"
  JAEGER_AGENT_HOST: "jaeger-agent.monitoring"
  JAEGER_AGENT_PORT: "6831"
  OTEL_EXPORTER_OTLP_ENDPOINT: "http://jaeger-collector.monitoring:4317"
  # OpenTelemetry Configuration - Direct to SigNoz
  OTEL_EXPORTER_OTLP_ENDPOINT: "http://signoz-otel-collector.signoz.svc.cluster.local:4317"
  OTEL_EXPORTER_OTLP_PROTOCOL: "grpc"
  OTEL_SERVICE_NAME: "bakery-ia"
  OTEL_RESOURCE_ATTRIBUTES: "deployment.environment=development"

  # SigNoz Endpoints
  SIGNOZ_ENDPOINT: "http://signoz-query-service.signoz.svc.cluster.local:8080"
  SIGNOZ_FRONTEND_URL: "http://signoz-frontend.signoz.svc.cluster.local:3301"

  # ================================================================
  # REPLENISHMENT PLANNING SETTINGS
  # ================================================================
  REPLENISHMENT_PROJECTION_HORIZON_DAYS: "7"
  REPLENISHMENT_SERVICE_LEVEL: "0.95"
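OTEL_RESOURCE_ATTRIBUTES is a comma-separated list of key=value pairs that the SDK attaches to every exported span, metric, and log record. A minimal sketch of the parsing the SDK performs (illustrative, not the SDK's actual code):

    def parse_resource_attributes(raw: str) -> dict[str, str]:
        """Parse the key=value,key=value format used by OTEL_RESOURCE_ATTRIBUTES."""
        return dict(pair.split("=", 1) for pair in raw.split(",") if pair)

    print(parse_resource_attributes("deployment.environment=development"))
    # {'deployment.environment': 'development'}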
@@ -9,11 +9,14 @@ metadata:

resources:
  - ../../base
  # Monitoring disabled for dev to save resources
  # - ../../base/components/monitoring
  # Monitoring enabled for dev environment
  - ../../base/components/monitoring
  - dev-ingress.yaml
  # SigNoz ingress is applied by Tilt (see Tiltfile)
  # - signoz-ingress.yaml
  # Dev-Prod Parity: Enable HTTPS with self-signed certificates
  - dev-certificate.yaml
  - monitoring-certificate.yaml
  - cluster-issuer-staging.yaml

# Exclude nominatim from dev to save resources
@@ -608,6 +611,39 @@ patches:
        limits:
          memory: "512Mi"
          cpu: "300m"
  # Optional exporters resource patches for dev
  - target:
      group: apps
      version: v1
      kind: DaemonSet
      name: node-exporter
      namespace: monitoring
    patch: |-
      - op: replace
        path: /spec/template/spec/containers/0/resources
        value:
          requests:
            memory: "32Mi"
            cpu: "25m"
          limits:
            memory: "64Mi"
            cpu: "100m"
  - target:
      group: apps
      version: v1
      kind: Deployment
      name: postgres-exporter
      namespace: monitoring
    patch: |-
      - op: replace
        path: /spec/template/spec/containers/0/resources
        value:
          requests:
            memory: "32Mi"
            cpu: "25m"
          limits:
            memory: "64Mi"
            cpu: "100m"

secretGenerator:
  - name: dev-secrets
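Each exporter patch above is an RFC 6902 JSON patch that kustomize applies at the given path. A toy sketch of what the single "replace" op does to a container spec (illustrative only):

    container = {
        "name": "node-exporter",
        "resources": {"requests": {"memory": "128Mi"}, "limits": {"memory": "256Mi"}},
    }

    # op: replace at .../containers/0/resources swaps the whole resources dict.
    container["resources"] = {
        "requests": {"memory": "32Mi", "cpu": "25m"},
        "limits": {"memory": "64Mi", "cpu": "100m"},
    }
    print(container["resources"]["limits"])  # {'memory': '64Mi', 'cpu': '100m'}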
@@ -0,0 +1,49 @@
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: bakery-dev-monitoring-tls-cert
  namespace: monitoring
spec:
  # Self-signed certificate for local development
  secretName: bakery-ia-tls-cert

  # Certificate duration
  duration: 2160h # 90 days
  renewBefore: 360h # 15 days

  # Subject configuration
  subject:
    organizations:
      - Bakery IA Development

  # Common name
  commonName: localhost

  # DNS names this certificate is valid for
  dnsNames:
    - localhost
    - monitoring.bakery-ia.local

  # IP addresses (for localhost)
  ipAddresses:
    - 127.0.0.1
    - ::1

  # Use self-signed issuer for development
  issuerRef:
    name: selfsigned-issuer
    kind: ClusterIssuer
    group: cert-manager.io

  # Private key configuration
  privateKey:
    algorithm: RSA
    encoding: PKCS1
    size: 2048

  # Usages
  usages:
    - server auth
    - client auth
    - digital signature
    - key encipherment
39  infrastructure/kubernetes/overlays/dev/signoz-ingress.yaml  Normal file
@@ -0,0 +1,39 @@
---
# SigNoz Ingress for Development (localhost)
# SigNoz is deployed via Helm in the 'signoz' namespace
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: signoz-ingress-localhost
  namespace: signoz
  annotations:
    nginx.ingress.kubernetes.io/ssl-redirect: "true"
    nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
    nginx.ingress.kubernetes.io/rewrite-target: /$2
    nginx.ingress.kubernetes.io/use-regex: "true"
spec:
  ingressClassName: nginx
  tls:
    - hosts:
        - localhost
      secretName: bakery-ia-tls-cert
  rules:
    - host: localhost
      http:
        paths:
          # SigNoz Frontend UI
          - path: /signoz(/|$)(.*)
            pathType: ImplementationSpecific
            backend:
              service:
                name: signoz-frontend
                port:
                  number: 3301
          # SigNoz Query Service API
          - path: /signoz-api(/|$)(.*)
            pathType: ImplementationSpecific
            backend:
              service:
                name: signoz-query-service
                port:
                  number: 8080
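With use-regex and rewrite-target: /$2, a request to /signoz/services is forwarded to the SigNoz frontend as /services, and /signoz itself becomes /. A sketch of the rewrite in Python's re, approximating nginx's behavior:

    import re

    rule = re.compile(r"^/signoz(/|$)(.*)")
    for path in ("/signoz", "/signoz/", "/signoz/services"):
        m = rule.match(path)
        print(path, "->", "/" + m.group(2))
    # /signoz -> /
    # /signoz/ -> /
    # /signoz/services -> /services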
@@ -14,6 +14,7 @@ resources:

patchesStrategicMerge:
  - storage-patch.yaml
  - monitoring-ingress-patch.yaml

labels:
  - includeSelectors: true
@@ -21,6 +22,89 @@ labels:
    environment: production
    tier: production

# SigNoz resource patches for production
patches:
  # SigNoz ClickHouse production configuration
  - target:
      group: apps
      version: v1
      kind: StatefulSet
      name: signoz-clickhouse
      namespace: signoz
    patch: |-
      - op: replace
        path: /spec/replicas
        value: 2
      - op: replace
        path: /spec/template/spec/containers/0/resources
        value:
          requests:
            memory: "2Gi"
            cpu: "500m"
          limits:
            memory: "4Gi"
            cpu: "1000m"
  # SigNoz Query Service production configuration
  - target:
      group: apps
      version: v1
      kind: Deployment
      name: signoz-query-service
      namespace: signoz
    patch: |-
      - op: replace
        path: /spec/replicas
        value: 2
      - op: replace
        path: /spec/template/spec/containers/0/resources
        value:
          requests:
            memory: "1Gi"
            cpu: "500m"
          limits:
            memory: "2Gi"
            cpu: "1000m"
  # SigNoz AlertManager production configuration
  - target:
      group: apps
      version: v1
      kind: Deployment
      name: signoz-alertmanager
      namespace: signoz
    patch: |-
      - op: replace
        path: /spec/replicas
        value: 2
      - op: replace
        path: /spec/template/spec/containers/0/resources
        value:
          requests:
            memory: "512Mi"
            cpu: "250m"
          limits:
            memory: "1Gi"
            cpu: "500m"
  # SigNoz Frontend production configuration
  - target:
      group: apps
      version: v1
      kind: Deployment
      name: signoz-frontend
      namespace: signoz
    patch: |-
      - op: replace
        path: /spec/replicas
        value: 2
      - op: replace
        path: /spec/template/spec/containers/0/resources
        value:
          requests:
            memory: "512Mi"
            cpu: "250m"
          limits:
            memory: "1Gi"
            cpu: "500m"

images:
  - name: bakery/auth-service
    newTag: latest
@@ -17,14 +17,30 @@ data:
  REQUEST_TIMEOUT: "30"
  MAX_CONNECTIONS: "100"

  # Monitoring
  PROMETHEUS_ENABLED: "true"
  # Monitoring - SigNoz (Unified Observability)
  ENABLE_TRACING: "true"
  ENABLE_METRICS: "true"
  JAEGER_ENABLED: "true"
  JAEGER_AGENT_HOST: "jaeger-agent.monitoring.svc.cluster.local"
  JAEGER_AGENT_PORT: "6831"
  ENABLE_LOGS: "true"

  # OpenTelemetry Configuration - Direct to SigNoz
  OTEL_EXPORTER_OTLP_ENDPOINT: "http://signoz-otel-collector.signoz.svc.cluster.local:4317"
  OTEL_EXPORTER_OTLP_PROTOCOL: "grpc"
  OTEL_SERVICE_NAME: "bakery-ia"
  OTEL_RESOURCE_ATTRIBUTES: "deployment.environment=production,cluster.name=bakery-ia-prod"

  # SigNoz Endpoints
  SIGNOZ_ENDPOINT: "http://signoz-query-service.signoz.svc.cluster.local:8080"
  SIGNOZ_FRONTEND_URL: "https://monitoring.bakewise.ai/signoz"
  SIGNOZ_ROOT_URL: "https://monitoring.bakewise.ai/signoz"

  # Rate Limiting (stricter in production)
  RATE_LIMIT_ENABLED: "true"
  RATE_LIMIT_PER_MINUTE: "60"

  # CORS Configuration for Production
  CORS_ORIGINS: "https://bakewise.ai"
  CORS_ALLOW_CREDENTIALS: "true"

  # Frontend Configuration
  VITE_API_URL: "/api"
  VITE_ENVIRONMENT: "production"
@@ -16,7 +16,7 @@ metadata:

    # CORS configuration for production
    nginx.ingress.kubernetes.io/enable-cors: "true"
    nginx.ingress.kubernetes.io/cors-allow-origin: "https://bakery.yourdomain.com,https://api.yourdomain.com"
    nginx.ingress.kubernetes.io/cors-allow-origin: "https://bakewise.ai"
    nginx.ingress.kubernetes.io/cors-allow-methods: "GET, POST, PUT, DELETE, OPTIONS, PATCH"
    nginx.ingress.kubernetes.io/cors-allow-headers: "Content-Type, Authorization, X-Requested-With, Accept, Origin"
    nginx.ingress.kubernetes.io/cors-allow-credentials: "true"
@@ -40,12 +40,10 @@ spec:
  ingressClassName: nginx
  tls:
    - hosts:
        - bakery.yourdomain.com
        - api.yourdomain.com
        - monitoring.yourdomain.com
        - bakewise.ai
      secretName: bakery-ia-prod-tls-cert
  rules:
    - host: bakery.yourdomain.com
    - host: bakewise.ai
      http:
        paths:
          - path: /
@@ -55,7 +53,7 @@ spec:
                name: frontend-service
                port:
                  number: 3000
          - path: /api
          - path: /api/v1
            pathType: Prefix
            backend:
              service:
@@ -63,31 +61,4 @@ spec:
                port:
                  number: 8000

    - host: api.yourdomain.com
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: gateway-service
                port:
                  number: 8000

    - host: monitoring.yourdomain.com
      http:
        paths:
          - path: /grafana
            pathType: Prefix
            backend:
              service:
                name: grafana-service
                port:
                  number: 3000
          - path: /prometheus
            pathType: Prefix
            backend:
              service:
                name: prometheus-service
                port:
                  number: 9090
# Monitoring (monitoring.bakewise.ai) is now handled by signoz-ingress.yaml in the signoz namespace
78  infrastructure/kubernetes/overlays/prod/signoz-ingress.yaml  Normal file
@@ -0,0 +1,78 @@
---
# SigNoz Ingress for Production
# SigNoz is deployed via Helm in the 'signoz' namespace
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: signoz-ingress-prod
  namespace: signoz
  labels:
    app.kubernetes.io/name: signoz
    app.kubernetes.io/component: ingress
  annotations:
    # Nginx ingress controller annotations
    nginx.ingress.kubernetes.io/ssl-redirect: "true"
    nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
    nginx.ingress.kubernetes.io/proxy-body-size: "50m"
    nginx.ingress.kubernetes.io/proxy-connect-timeout: "600"
    nginx.ingress.kubernetes.io/proxy-send-timeout: "600"
    nginx.ingress.kubernetes.io/proxy-read-timeout: "600"
    nginx.ingress.kubernetes.io/rewrite-target: /$2
    nginx.ingress.kubernetes.io/use-regex: "true"

    # CORS configuration
    nginx.ingress.kubernetes.io/enable-cors: "true"
    nginx.ingress.kubernetes.io/cors-allow-origin: "https://bakewise.ai,https://monitoring.bakewise.ai"
    nginx.ingress.kubernetes.io/cors-allow-methods: "GET, POST, PUT, DELETE, OPTIONS, PATCH"
    nginx.ingress.kubernetes.io/cors-allow-headers: "Content-Type, Authorization, X-Requested-With, Accept, Origin"
    nginx.ingress.kubernetes.io/cors-allow-credentials: "true"

    # Security headers
    nginx.ingress.kubernetes.io/configuration-snippet: |
      more_set_headers "X-Frame-Options: SAMEORIGIN";
      more_set_headers "X-Content-Type-Options: nosniff";
      more_set_headers "X-XSS-Protection: 1; mode=block";
      more_set_headers "Referrer-Policy: strict-origin-when-cross-origin";

    # Rate limiting
    nginx.ingress.kubernetes.io/limit-rps: "100"
    nginx.ingress.kubernetes.io/limit-connections: "50"

    # Cert-manager annotations for automatic certificate issuance
    cert-manager.io/cluster-issuer: "letsencrypt-production"
    cert-manager.io/acme-challenge-type: http01

spec:
  ingressClassName: nginx
  tls:
    - hosts:
        - monitoring.bakewise.ai
      secretName: signoz-prod-tls-cert
  rules:
    - host: monitoring.bakewise.ai
      http:
        paths:
          # SigNoz Frontend UI
          - path: /signoz(/|$)(.*)
            pathType: ImplementationSpecific
            backend:
              service:
                name: signoz-frontend
                port:
                  number: 3301
          # SigNoz Query Service API
          - path: /signoz-api(/|$)(.*)
            pathType: ImplementationSpecific
            backend:
              service:
                name: signoz-query-service
                port:
                  number: 8080
          # SigNoz AlertManager
          - path: /signoz-alerts(/|$)(.*)
            pathType: ImplementationSpecific
            backend:
              service:
                name: signoz-alertmanager
                port:
                  number: 9093
79  infrastructure/kubernetes/signoz-values.yaml  Normal file
@@ -0,0 +1,79 @@
# SigNoz Helm Chart Values - Customized for Bakery IA
# https://github.com/SigNoz/charts

# Global settings
global:
  storageClass: "standard"

# Frontend configuration
frontend:
  service:
    type: ClusterIP
    port: 3301
  ingress:
    enabled: true
    hosts:
      - host: localhost
        paths:
          - path: /signoz
            pathType: Prefix
    annotations:
      nginx.ingress.kubernetes.io/rewrite-target: /$2

# Query Service configuration
queryService:
  replicaCount: 1
  resources:
    requests:
      cpu: 100m
      memory: 256Mi
    limits:
      cpu: 200m
      memory: 512Mi

# AlertManager configuration
alertmanager:
  replicaCount: 1
  resources:
    requests:
      cpu: 50m
      memory: 128Mi
    limits:
      cpu: 100m
      memory: 256Mi

# ClickHouse configuration
clickhouse:
  persistence:
    enabled: true
    size: 10Gi
  resources:
    requests:
      cpu: 500m
      memory: 1Gi
    limits:
      cpu: 1000m
      memory: 2Gi

# OpenTelemetry Collector configuration
otelCollector:
  enabled: true
  config:
    exporters:
      otlp:
        endpoint: "signoz-query-service:8080"
    service:
      pipelines:
        traces:
          receivers: [otlp]
          exporters: [otlp]
        metrics:
          receivers: [otlp]
          exporters: [otlp]
        logs:
          receivers: [otlp]
          exporters: [otlp]

# Resource optimization for development
# These can be increased for production
development: true
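Before handing this file to helm, it is worth checking that it parses and that each collector pipeline only references declared exporters. A quick sketch, assuming PyYAML is installed:

    import yaml

    with open("infrastructure/kubernetes/signoz-values.yaml") as f:
        values = yaml.safe_load(f)

    config = values["otelCollector"]["config"]
    declared = set(config["exporters"])
    for name, pipeline in config["service"]["pipelines"].items():
        missing = set(pipeline["exporters"]) - declared
        assert not missing, f"pipeline {name} references undeclared exporters: {missing}"
    print("pipelines:", sorted(config["service"]["pipelines"]))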
@@ -228,6 +228,12 @@ setup() {

    if [ $? -eq 0 ]; then
        print_success "Colima started successfully"

        # Increase inotify limits for Colima to prevent "too many open files" errors
        print_status "Increasing inotify limits in Colima VM..."
        colima ssh --profile k8s-local "sudo sysctl -w fs.inotify.max_user_watches=524288"
        colima ssh --profile k8s-local "sudo sysctl -w fs.inotify.max_user_instances=512"
        print_success "Inotify limits increased"
    else
        print_error "Failed to start Colima"
        exit 1
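Once the sysctl calls have run, the new limits can be confirmed by reading the procfs knobs directly (a sketch; run it inside the Colima VM, not on the macOS host):

    from pathlib import Path

    for knob in ("max_user_watches", "max_user_instances"):
        value = Path(f"/proc/sys/fs/inotify/{knob}").read_text().strip()
        print(knob, "=", value)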
@@ -261,23 +267,23 @@ setup() {

    # 4. Connect registry to Kind network
    connect_registry_to_kind

    # 3. Install NGINX Ingress Controller

    # 5. Install NGINX Ingress Controller
    print_status "Installing NGINX Ingress Controller..."

    # Apply the ingress-nginx manifest
    kubectl apply -f https://raw.githubusercontent.com/kubernetes/ingress-nginx/main/deploy/static/provider/kind/deploy.yaml

    if [ $? -eq 0 ]; then
        print_success "NGINX Ingress Controller manifest applied"
    else
        print_error "Failed to apply NGINX Ingress Controller manifest"
        exit 1
    fi

    # Wait for ingress-nginx pods to be ready with retry logic
    wait_for_pods "ingress-nginx" "app.kubernetes.io/component=controller" 300

    if [ $? -ne 0 ]; then
        print_error "NGINX Ingress Controller failed to become ready"
        print_status "Checking pod status for debugging..."
@@ -285,30 +291,10 @@ setup() {
        kubectl describe pods -n ingress-nginx
        exit 1
    fi

    # 4. Configure permanent localhost access
    print_status "Configuring localhost access via NodePort..."

    # Check if service exists
    if kubectl get svc ingress-nginx-controller -n ingress-nginx &>/dev/null; then
        # Patch the service to expose NodePorts
        kubectl patch svc ingress-nginx-controller \
            -n ingress-nginx \
            --type merge \
            -p '{"spec":{"type":"NodePort","ports":[{"name":"http","port":80,"targetPort":"http","nodePort":30080},{"name":"https","port":443,"targetPort":"https","nodePort":30443}]}}'

        if [ $? -eq 0 ]; then
            print_success "NodePort configuration applied"
        else
            print_error "Failed to patch Ingress service"
            exit 1
        fi
    else
        print_error "Ingress NGINX controller service not found"
        exit 1
    fi

    # 5. Verify port mappings from kind-config.yaml

    print_success "NGINX Ingress Controller ready (using Kind's built-in NodePort configuration)"

    # 6. Verify port mappings from kind-config.yaml
    print_status "Verifying port mappings from configuration..."

    # Extract ports from kind-config.yaml
@@ -323,24 +309,24 @@ setup() {
    echo "  - Colima profile: k8s-local"
    echo "  - Kind cluster: $CLUSTER_NAME"
    echo "  - Local registry: localhost:5001"
    echo "  - Direct port mappings (from kind-config.yaml):"
    echo "      Frontend: localhost:3000 -> container:30300"
    echo "      Gateway: localhost:8000 -> container:30800"
    echo "  - Ingress access:"
    echo "      HTTP: localhost:${HTTP_HOST_PORT} -> ingress:30080"
    echo "      HTTPS: localhost:${HTTPS_HOST_PORT} -> ingress:30443"
    echo "  - NodePort access:"
    echo "      HTTP: localhost:30080"
    echo "      HTTPS: localhost:30443"
    echo "----------------------------------------"
    print_status "To access your applications:"
    echo "  - Use Ingress via: http://localhost:${HTTP_HOST_PORT}"
    echo "  - Direct NodePort: http://localhost:30080"
    echo ""
    print_status "Port Mappings (configured in kind-config.yaml):"
    echo "  - HTTP Ingress: localhost:${HTTP_HOST_PORT} -> Kind NodePort 30080"
    echo "  - HTTPS Ingress: localhost:${HTTPS_HOST_PORT} -> Kind NodePort 30443"
    echo "  - Frontend Direct: localhost:3000 -> container:30300"
    echo "  - Gateway Direct: localhost:8000 -> container:30800"
    echo ""
    print_status "How to access your application:"
    echo "  1. Start Tilt: tilt up"
    echo "  2. Access via:"
    echo "     - Ingress: http://localhost (or https://localhost)"
    echo "     - Direct: http://localhost:3000 (frontend), http://localhost:8000 (gateway)"
    echo "     - Tilt UI: http://localhost:10350"
    echo "----------------------------------------"
    print_status "Local Registry Information:"
    echo "  - Registry URL: localhost:5001"
    echo "  - Images will be pushed to: localhost:5001/bakery/<service>"
    echo "  - Update your Tiltfile with: default_registry('localhost:5001')"
    echo "  - Images pushed to: localhost:5001/bakery/<service>"
    echo "  - Tiltfile already configured: default_registry('localhost:5001')"
    echo "----------------------------------------"
}
@@ -1,22 +1,50 @@
"""Main FastAPI application for AI Insights Service."""

from fastapi import FastAPI
from fastapi import FastAPI, Response
from fastapi.middleware.cors import CORSMiddleware
from contextlib import asynccontextmanager
import structlog
import os

from app.core.config import settings
from app.core.database import init_db, close_db
from app.api import insights
from shared.monitoring.logging import setup_logging
from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware

# Configure structured logging
structlog.configure(
    processors=[
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.JSONRenderer()
    ]
)
# OpenTelemetry imports
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
from opentelemetry.instrumentation.redis import RedisInstrumentor
from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
from opentelemetry.sdk.resources import Resource

# Configure OpenTelemetry tracing
def setup_tracing(service_name: str = "ai-insights"):
    """Initialize OpenTelemetry tracing with an OTLP exporter (SigNoz collector)."""
    resource = Resource.create({"service.name": service_name})

    otlp_exporter = OTLPSpanExporter(
        endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://otel-collector.monitoring.svc.cluster.local:4317"),
        insecure=True
    )

    provider = TracerProvider(resource=resource)
    processor = BatchSpanProcessor(otlp_exporter)
    provider.add_span_processor(processor)
    trace.set_tracer_provider(provider)

    return provider

# Initialize tracing
tracer_provider = setup_tracing("ai-insights")

# Setup logging
setup_logging("ai-insights", getattr(settings, 'LOG_LEVEL', 'INFO'))
logger = structlog.get_logger()
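With the provider registered globally, any module in the service can start spans through the standard OpenTelemetry API. A minimal usage sketch (hypothetical handler code, not part of this commit):

    from opentelemetry import trace

    tracer = trace.get_tracer(__name__)

    def score_insight(insight_id: str) -> None:
        # Spans created here are batched and shipped to the OTLP endpoint above.
        with tracer.start_as_current_span("score_insight") as span:
            span.set_attribute("insight.id", insight_id)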
@@ -28,6 +56,10 @@ async def lifespan(app: FastAPI):
    await init_db()
    logger.info("Database initialized")

    # Start metrics server
    metrics_collector.start_metrics_server(8080)
    logger.info("Metrics server started on port 8080")

    yield

    # Shutdown
@@ -44,6 +76,24 @@ app = FastAPI(
    lifespan=lifespan
)

# Instrument FastAPI with OpenTelemetry
FastAPIInstrumentor.instrument_app(app)

# Instrument httpx for outgoing requests
HTTPXClientInstrumentor().instrument()

# Instrument Redis
RedisInstrumentor().instrument()

# Instrument SQLAlchemy
SQLAlchemyInstrumentor().instrument()

# Initialize metrics collector
metrics_collector = MetricsCollector("ai-insights")

# Add metrics middleware to track HTTP requests
add_metrics_middleware(app, metrics_collector)

# CORS middleware
app.add_middleware(
    CORSMiddleware,
@@ -81,6 +131,15 @@ async def health_check():
    }


@app.get("/metrics")
async def metrics():
    """Prometheus metrics endpoint"""
    return Response(
        content=metrics_collector.get_metrics(),
        media_type="text/plain; version=0.0.4; charset=utf-8"
    )


if __name__ == "__main__":
    import uvicorn
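The endpoint serves the Prometheus text exposition format, so it can be checked with any HTTP client. A sketch, assuming the service is reachable on localhost:8000:

    from urllib.request import urlopen

    with urlopen("http://localhost:8000/metrics") as resp:
        body = resp.read().decode()
    # Exposition format: "# HELP ..." / "# TYPE ..." headers followed by samples.
    print([line for line in body.splitlines() if line.startswith("# TYPE")][:5])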
@@ -29,6 +29,16 @@ pytz==2023.3
# Logging
structlog==23.2.0

# Monitoring and Observability
prometheus-client==0.23.1
opentelemetry-api==1.27.0
opentelemetry-sdk==1.27.0
opentelemetry-instrumentation-fastapi==0.48b0
opentelemetry-exporter-otlp-proto-grpc==1.27.0
opentelemetry-instrumentation-httpx==0.48b0
opentelemetry-instrumentation-redis==0.48b0
opentelemetry-instrumentation-sqlalchemy==0.48b0

# Machine Learning (for confidence scoring and impact estimation)
numpy==1.26.2
pandas==2.1.3
@@ -4,25 +4,52 @@ Alert Processor Service v2.0
Main FastAPI application with RabbitMQ consumer lifecycle management.
"""

from fastapi import FastAPI
from fastapi import FastAPI, Response
from fastapi.middleware.cors import CORSMiddleware
from contextlib import asynccontextmanager
import structlog
import os

from app.core.config import settings
from app.consumer.event_consumer import EventConsumer
from app.api import alerts, sse
from shared.redis_utils import initialize_redis, close_redis
from shared.monitoring.logging import setup_logging
from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware

# Configure structured logging
structlog.configure(
    processors=[
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.add_log_level,
        structlog.processors.JSONRenderer()
    ]
)
# OpenTelemetry imports
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
from opentelemetry.instrumentation.redis import RedisInstrumentor
from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
from opentelemetry.sdk.resources import Resource

# Configure OpenTelemetry tracing
def setup_tracing(service_name: str = "alert-processor"):
    """Initialize OpenTelemetry tracing with an OTLP exporter (SigNoz collector)."""
    resource = Resource.create({"service.name": service_name})

    otlp_exporter = OTLPSpanExporter(
        endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://otel-collector.monitoring.svc.cluster.local:4317"),
        insecure=True
    )

    provider = TracerProvider(resource=resource)
    processor = BatchSpanProcessor(otlp_exporter)
    provider.add_span_processor(processor)
    trace.set_tracer_provider(provider)

    return provider

# Initialize tracing
tracer_provider = setup_tracing("alert-processor")

# Setup logging
setup_logging("alert-processor", getattr(settings, 'LOG_LEVEL', 'INFO'))
logger = structlog.get_logger()

# Global consumer instance
@@ -54,6 +81,10 @@ async def lifespan(app: FastAPI):
        consumer = EventConsumer()
        await consumer.start()
        logger.info("alert_processor_started")

        # Start metrics server
        metrics_collector.start_metrics_server(8080)
        logger.info("Metrics server started on port 8080")
    except Exception as e:
        logger.error("alert_processor_startup_failed", error=str(e))
        raise
@@ -79,6 +110,24 @@ app = FastAPI(
    debug=settings.DEBUG
)

# Instrument FastAPI with OpenTelemetry
FastAPIInstrumentor.instrument_app(app)

# Instrument httpx for outgoing requests
HTTPXClientInstrumentor().instrument()

# Instrument Redis
RedisInstrumentor().instrument()

# Instrument SQLAlchemy
SQLAlchemyInstrumentor().instrument()

# Initialize metrics collector
metrics_collector = MetricsCollector("alert-processor")

# Add metrics middleware to track HTTP requests
add_metrics_middleware(app, metrics_collector)

# CORS middleware
app.add_middleware(
    CORSMiddleware,
@@ -126,6 +175,15 @@ async def root():
    }


@app.get("/metrics")
async def metrics():
    """Prometheus metrics endpoint"""
    return Response(
        content=metrics_collector.get_metrics(),
        media_type="text/plain; version=0.0.4; charset=utf-8"
    )


if __name__ == "__main__":
    import uvicorn
@@ -32,3 +32,13 @@ python-dateutil==2.8.2

# Authentication
python-jose[cryptography]==3.3.0

# Monitoring and Observability
prometheus-client==0.23.1
opentelemetry-api==1.27.0
opentelemetry-sdk==1.27.0
opentelemetry-instrumentation-fastapi==0.48b0
opentelemetry-exporter-otlp-proto-grpc==1.27.0
opentelemetry-instrumentation-httpx==0.48b0
opentelemetry-instrumentation-redis==0.48b0
opentelemetry-instrumentation-sqlalchemy==0.48b0
@@ -3,16 +3,51 @@ Demo Session Service - Main Application
Manages isolated demo sessions with ephemeral data
"""

from fastapi import FastAPI, Request
from fastapi import FastAPI, Request, Response
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
import structlog
from contextlib import asynccontextmanager
import os

from app.core import settings, DatabaseManager
from app.api import demo_sessions, demo_accounts, demo_operations, internal
from shared.redis_utils import initialize_redis, close_redis
from shared.monitoring.logging import setup_logging
from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware

# OpenTelemetry imports
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
from opentelemetry.instrumentation.redis import RedisInstrumentor
from opentelemetry.sdk.resources import Resource

# Configure OpenTelemetry tracing
def setup_tracing(service_name: str = "demo-session"):
    """Initialize OpenTelemetry tracing with an OTLP exporter (SigNoz collector)."""
    resource = Resource.create({"service.name": service_name})

    otlp_exporter = OTLPSpanExporter(
        endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://otel-collector.monitoring.svc.cluster.local:4317"),
        insecure=True
    )

    provider = TracerProvider(resource=resource)
    processor = BatchSpanProcessor(otlp_exporter)
    provider.add_span_processor(processor)
    trace.set_tracer_provider(provider)

    return provider

# Initialize tracing
tracer_provider = setup_tracing("demo-session")

# Setup logging
setup_logging("demo-session", getattr(settings, 'LOG_LEVEL', 'INFO'))
logger = structlog.get_logger()

# Initialize database
||||
# Initialize database
|
||||
@@ -34,6 +69,10 @@ async def lifespan(app: FastAPI):
|
||||
max_connections=50
|
||||
)
|
||||
|
||||
# Start metrics server
|
||||
metrics_collector.start_metrics_server(8080)
|
||||
logger.info("Metrics server started on port 8080")
|
||||
|
||||
logger.info("Demo Session Service started successfully")
|
||||
|
||||
yield
|
||||
@@ -52,6 +91,21 @@ app = FastAPI(
    lifespan=lifespan
)

# Instrument FastAPI with OpenTelemetry
FastAPIInstrumentor.instrument_app(app)

# Instrument httpx for outgoing requests
HTTPXClientInstrumentor().instrument()

# Instrument Redis
RedisInstrumentor().instrument()

# Initialize metrics collector
metrics_collector = MetricsCollector("demo-session")

# Add metrics middleware to track HTTP requests
add_metrics_middleware(app, metrics_collector)

# CORS middleware
app.add_middleware(
    CORSMiddleware,
@@ -110,6 +164,15 @@ async def health():
    }


@app.get("/metrics")
async def metrics():
    """Prometheus metrics endpoint"""
    return Response(
        content=metrics_collector.get_metrics(),
        media_type="text/plain; version=0.0.4; charset=utf-8"
    )


if __name__ == "__main__":
    import uvicorn
    uvicorn.run(
@@ -18,3 +18,11 @@ prometheus-client==0.23.1
aio-pika==9.4.3
email-validator==2.2.0
pytz==2024.2

# OpenTelemetry for distributed tracing
opentelemetry-api==1.27.0
opentelemetry-sdk==1.27.0
opentelemetry-instrumentation-fastapi==0.48b0
opentelemetry-exporter-otlp-proto-grpc==1.27.0
opentelemetry-instrumentation-httpx==0.48b0
opentelemetry-instrumentation-redis==0.48b0