diff --git a/Tiltfile b/Tiltfile index eaa6018e..df53524e 100644 --- a/Tiltfile +++ b/Tiltfile @@ -36,6 +36,11 @@ Security Features: โœ… pgcrypto extension for encryption โœ… PostgreSQL audit logging +Monitoring: + ๐Ÿ“Š Service metrics available at /metrics endpoints + ๐Ÿ” Telemetry ready (traces, metrics, logs) + โ„น๏ธ SigNoz deployment optional for local dev (see signoz-info resource) + Applying security configurations... """) @@ -303,82 +308,131 @@ k8s_resource('redis', resource_deps=['security-setup'], labels=['01-infrastructu k8s_resource('rabbitmq', labels=['01-infrastructure']) k8s_resource('nominatim', labels=['01-infrastructure']) +# ============================================================================= +# MONITORING RESOURCES - SigNoz (Unified Observability) +# ============================================================================= + +# Note: SigNoz Helm chart is complex for local dev +# For development, access SigNoz manually or use production Helm deployment +# To deploy SigNoz manually: ./infrastructure/helm/deploy-signoz.sh dev +local_resource( + 'signoz-info', + cmd=''' + echo "๐Ÿ“Š SigNoz Monitoring Information" + echo "" + echo "SigNoz Helm deployment is disabled for local development due to complexity." + echo "" + echo "Options:" + echo "1. Deploy manually: ./infrastructure/helm/deploy-signoz.sh dev" + echo "2. Use production deployment: ./infrastructure/helm/deploy-signoz.sh prod" + echo "3. Skip monitoring for local development (use application metrics only)" + echo "" + echo "For simpler local monitoring, consider using just Prometheus+Grafana" + echo "or access metrics directly from services at /metrics endpoints." + ''', + labels=['05-monitoring'], + auto_init=False, + trigger_mode=TRIGGER_MODE_MANUAL +) + +# SigNoz ingress (only if manually deployed) +# Uncomment and trigger manually if you deploy SigNoz +# local_resource( +# 'signoz-ingress', +# cmd=''' +# echo "๐ŸŒ Applying SigNoz ingress..." +# kubectl apply -f infrastructure/kubernetes/overlays/dev/signoz-ingress.yaml +# echo "โœ… SigNoz ingress configured" +# ''', +# labels=['05-monitoring'], +# auto_init=False, +# trigger_mode=TRIGGER_MODE_MANUAL +# ) + +# Note: SigNoz components are managed by Helm and deployed outside of kustomize +# They will appear automatically once deployed, but we don't track them explicitly in Tilt +# to avoid startup errors. 
View them with: kubectl get pods -n signoz + +# Optional exporters (in monitoring namespace) +k8s_resource('node-exporter', labels=['05-monitoring']) +k8s_resource('postgres-exporter', resource_deps=['auth-db'], labels=['05-monitoring']) + # ============================================================================= # DATABASE RESOURCES # ============================================================================= # Core Service Databases -k8s_resource('auth-db', resource_deps=['security-setup'], labels=['02-databases']) -k8s_resource('tenant-db', resource_deps=['security-setup'], labels=['02-databases']) +k8s_resource('auth-db', resource_deps=['security-setup'], labels=['06-databases']) +k8s_resource('tenant-db', resource_deps=['security-setup'], labels=['06-databases']) # Data & Analytics Databases -k8s_resource('training-db', resource_deps=['security-setup'], labels=['02-databases']) -k8s_resource('forecasting-db', resource_deps=['security-setup'], labels=['02-databases']) -k8s_resource('ai-insights-db', resource_deps=['security-setup'], labels=['02-databases']) +k8s_resource('training-db', resource_deps=['security-setup'], labels=['06-databases']) +k8s_resource('forecasting-db', resource_deps=['security-setup'], labels=['06-databases']) +k8s_resource('ai-insights-db', resource_deps=['security-setup'], labels=['06-databases']) # Operations Databases -k8s_resource('sales-db', resource_deps=['security-setup'], labels=['02-databases']) -k8s_resource('inventory-db', resource_deps=['security-setup'], labels=['02-databases']) -k8s_resource('production-db', resource_deps=['security-setup'], labels=['02-databases']) -k8s_resource('procurement-db', resource_deps=['security-setup'], labels=['02-databases']) -k8s_resource('distribution-db', resource_deps=['security-setup'], labels=['02-databases']) +k8s_resource('sales-db', resource_deps=['security-setup'], labels=['06-databases']) +k8s_resource('inventory-db', resource_deps=['security-setup'], labels=['06-databases']) +k8s_resource('production-db', resource_deps=['security-setup'], labels=['06-databases']) +k8s_resource('procurement-db', resource_deps=['security-setup'], labels=['06-databases']) +k8s_resource('distribution-db', resource_deps=['security-setup'], labels=['06-databases']) # Supporting Service Databases -k8s_resource('recipes-db', resource_deps=['security-setup'], labels=['02-databases']) -k8s_resource('suppliers-db', resource_deps=['security-setup'], labels=['02-databases']) -k8s_resource('pos-db', resource_deps=['security-setup'], labels=['02-databases']) -k8s_resource('orders-db', resource_deps=['security-setup'], labels=['02-databases']) -k8s_resource('external-db', resource_deps=['security-setup'], labels=['02-databases']) +k8s_resource('recipes-db', resource_deps=['security-setup'], labels=['06-databases']) +k8s_resource('suppliers-db', resource_deps=['security-setup'], labels=['06-databases']) +k8s_resource('pos-db', resource_deps=['security-setup'], labels=['06-databases']) +k8s_resource('orders-db', resource_deps=['security-setup'], labels=['06-databases']) +k8s_resource('external-db', resource_deps=['security-setup'], labels=['06-databases']) # Platform Service Databases -k8s_resource('notification-db', resource_deps=['security-setup'], labels=['02-databases']) -k8s_resource('alert-processor-db', resource_deps=['security-setup'], labels=['02-databases']) -k8s_resource('orchestrator-db', resource_deps=['security-setup'], labels=['02-databases']) +k8s_resource('notification-db', resource_deps=['security-setup'], 
labels=['06-databases']) +k8s_resource('alert-processor-db', resource_deps=['security-setup'], labels=['06-databases']) +k8s_resource('orchestrator-db', resource_deps=['security-setup'], labels=['06-databases']) # Demo Service Databases -k8s_resource('demo-session-db', resource_deps=['security-setup'], labels=['02-databases']) +k8s_resource('demo-session-db', resource_deps=['security-setup'], labels=['06-databases']) # ============================================================================= # MIGRATION JOBS # ============================================================================= # Core Service Migrations -k8s_resource('auth-migration', resource_deps=['auth-db'], labels=['03-migrations']) -k8s_resource('tenant-migration', resource_deps=['tenant-db'], labels=['03-migrations']) +k8s_resource('auth-migration', resource_deps=['auth-db'], labels=['07-migrations']) +k8s_resource('tenant-migration', resource_deps=['tenant-db'], labels=['07-migrations']) # Data & Analytics Migrations -k8s_resource('training-migration', resource_deps=['training-db'], labels=['03-migrations']) -k8s_resource('forecasting-migration', resource_deps=['forecasting-db'], labels=['03-migrations']) -k8s_resource('ai-insights-migration', resource_deps=['ai-insights-db'], labels=['03-migrations']) +k8s_resource('training-migration', resource_deps=['training-db'], labels=['07-migrations']) +k8s_resource('forecasting-migration', resource_deps=['forecasting-db'], labels=['07-migrations']) +k8s_resource('ai-insights-migration', resource_deps=['ai-insights-db'], labels=['07-migrations']) # Operations Migrations -k8s_resource('sales-migration', resource_deps=['sales-db'], labels=['03-migrations']) -k8s_resource('inventory-migration', resource_deps=['inventory-db'], labels=['03-migrations']) -k8s_resource('production-migration', resource_deps=['production-db'], labels=['03-migrations']) -k8s_resource('procurement-migration', resource_deps=['procurement-db'], labels=['03-migrations']) -k8s_resource('distribution-migration', resource_deps=['distribution-db'], labels=['03-migrations']) +k8s_resource('sales-migration', resource_deps=['sales-db'], labels=['07-migrations']) +k8s_resource('inventory-migration', resource_deps=['inventory-db'], labels=['07-migrations']) +k8s_resource('production-migration', resource_deps=['production-db'], labels=['07-migrations']) +k8s_resource('procurement-migration', resource_deps=['procurement-db'], labels=['07-migrations']) +k8s_resource('distribution-migration', resource_deps=['distribution-db'], labels=['07-migrations']) # Supporting Service Migrations -k8s_resource('recipes-migration', resource_deps=['recipes-db'], labels=['03-migrations']) -k8s_resource('suppliers-migration', resource_deps=['suppliers-db'], labels=['03-migrations']) -k8s_resource('pos-migration', resource_deps=['pos-db'], labels=['03-migrations']) -k8s_resource('orders-migration', resource_deps=['orders-db'], labels=['03-migrations']) -k8s_resource('external-migration', resource_deps=['external-db'], labels=['03-migrations']) +k8s_resource('recipes-migration', resource_deps=['recipes-db'], labels=['07-migrations']) +k8s_resource('suppliers-migration', resource_deps=['suppliers-db'], labels=['07-migrations']) +k8s_resource('pos-migration', resource_deps=['pos-db'], labels=['07-migrations']) +k8s_resource('orders-migration', resource_deps=['orders-db'], labels=['07-migrations']) +k8s_resource('external-migration', resource_deps=['external-db'], labels=['07-migrations']) # Platform Service Migrations 
-k8s_resource('notification-migration', resource_deps=['notification-db'], labels=['03-migrations']) -k8s_resource('alert-processor-migration', resource_deps=['alert-processor-db'], labels=['03-migrations']) -k8s_resource('orchestrator-migration', resource_deps=['orchestrator-db'], labels=['03-migrations']) +k8s_resource('notification-migration', resource_deps=['notification-db'], labels=['07-migrations']) +k8s_resource('alert-processor-migration', resource_deps=['alert-processor-db'], labels=['07-migrations']) +k8s_resource('orchestrator-migration', resource_deps=['orchestrator-db'], labels=['07-migrations']) # Demo Service Migrations -k8s_resource('demo-session-migration', resource_deps=['demo-session-db'], labels=['03-migrations']) +k8s_resource('demo-session-migration', resource_deps=['demo-session-db'], labels=['07-migrations']) # ============================================================================= # DATA INITIALIZATION JOBS # ============================================================================= -k8s_resource('external-data-init', resource_deps=['external-migration', 'redis'], labels=['04-data-init']) -k8s_resource('nominatim-init', labels=['04-data-init']) +k8s_resource('external-data-init', resource_deps=['external-migration', 'redis'], labels=['08-data-init']) +k8s_resource('nominatim-init', labels=['08-data-init']) # ============================================================================= # ============================================================================= @@ -517,8 +571,16 @@ Internal Schedulers Active: โฐ Usage Tracking: Daily @ 2:00 AM UTC (tenant-service) Access your application: - Frontend: http://localhost:3000 (or via ingress) - Gateway: http://localhost:8000 (or via ingress) + Main Application: https://localhost + API Endpoints: https://localhost/api/v1/... 
+ + Service Metrics: + Gateway: http://localhost:8000/metrics + Any Service: kubectl port-forward 8000:8000 + + SigNoz (Optional - see SIGNOZ_DEPLOYMENT_RECOMMENDATIONS.md): + Deploy manually: ./infrastructure/helm/deploy-signoz.sh dev + Access (if deployed): https://localhost/signoz Verify security: kubectl get pvc -n bakery-ia diff --git a/docs/MONITORING_DEPLOYMENT_SUMMARY.md b/docs/MONITORING_DEPLOYMENT_SUMMARY.md deleted file mode 100644 index 0f194b01..00000000 --- a/docs/MONITORING_DEPLOYMENT_SUMMARY.md +++ /dev/null @@ -1,459 +0,0 @@ -# ๐ŸŽ‰ Production Monitoring MVP - Implementation Complete - -**Date:** 2026-01-07 -**Status:** โœ… READY FOR PRODUCTION DEPLOYMENT - ---- - -## ๐Ÿ“Š What Was Implemented - -### **Phase 1: Core Infrastructure** โœ… -- โœ… **Prometheus v3.0.1** (2 replicas, HA mode with StatefulSet) -- โœ… **AlertManager v0.27.0** (3 replicas, clustered with gossip protocol) -- โœ… **Grafana v12.3.0** (secure credentials via Kubernetes Secrets) -- โœ… **PostgreSQL Exporter v0.15.0** (database health monitoring) -- โœ… **Node Exporter v1.7.0** (infrastructure monitoring via DaemonSet) -- โœ… **Jaeger v1.51** (distributed tracing with persistent storage) - -### **Phase 2: Alert Management** โœ… -- โœ… **50+ Alert Rules** across 9 categories: - - Service health & performance - - Business logic (ML training, API limits) - - Alert system health & performance - - Database & infrastructure alerts - - Monitoring self-monitoring -- โœ… **Intelligent Alert Routing** by severity, component, and service -- โœ… **Alert Inhibition Rules** to prevent alert storms -- โœ… **Multi-Channel Notifications** (email + Slack support) - -### **Phase 3: High Availability** โœ… -- โœ… **PodDisruptionBudgets** for all monitoring components -- โœ… **Anti-affinity Rules** to spread pods across nodes -- โœ… **ResourceQuota & LimitRange** for namespace resource management -- โœ… **StatefulSets** with volumeClaimTemplates for persistent storage -- โœ… **Headless Services** for StatefulSet DNS discovery - -### **Phase 4: Observability** โœ… -- โœ… **11 Grafana Dashboards** (7 pre-configured + 4 extended): - 1. Gateway Metrics - 2. Services Overview - 3. Circuit Breakers - 4. PostgreSQL Database (13 panels) - 5. Node Exporter Infrastructure (19 panels) - 6. AlertManager Monitoring (15 panels) - 7. Business Metrics & KPIs (21 panels) - 8-11. 
Plus existing dashboards -- โœ… **Distributed Tracing** enabled in production -- โœ… **Comprehensive Documentation** with runbooks - ---- - -## ๐Ÿ“ Files Created/Modified - -### **New Files:** -``` -infrastructure/kubernetes/base/components/monitoring/ -โ”œโ”€โ”€ secrets.yaml # Monitoring credentials -โ”œโ”€โ”€ alertmanager.yaml # AlertManager StatefulSet (3 replicas) -โ”œโ”€โ”€ alertmanager-init.yaml # Config initialization script -โ”œโ”€โ”€ alert-rules.yaml # 50+ alert rules -โ”œโ”€โ”€ postgres-exporter.yaml # PostgreSQL monitoring -โ”œโ”€โ”€ node-exporter.yaml # Infrastructure monitoring (DaemonSet) -โ”œโ”€โ”€ grafana-dashboards-extended.yaml # 4 comprehensive dashboards -โ”œโ”€โ”€ ha-policies.yaml # PDBs + ResourceQuota + LimitRange -โ””โ”€โ”€ README.md # Complete documentation (500+ lines) -``` - -### **Modified Files:** -``` -infrastructure/kubernetes/base/components/monitoring/ -โ”œโ”€โ”€ prometheus.yaml # Now StatefulSet with 2 replicas + alert config -โ”œโ”€โ”€ grafana.yaml # Using secrets + extended dashboards mounted -โ”œโ”€โ”€ ingress.yaml # Added /alertmanager path -โ””โ”€โ”€ kustomization.yaml # Added all new resources - -infrastructure/kubernetes/overlays/prod/ -โ”œโ”€โ”€ kustomization.yaml # Enabled monitoring stack -โ””โ”€โ”€ prod-configmap.yaml # JAEGER_ENABLED=true -``` - -### **Deleted:** -``` -infrastructure/monitoring/ # Old legacy config (completely removed) -``` - ---- - -## ๐Ÿš€ Deployment Instructions - -### **1. Update Secrets (REQUIRED BEFORE DEPLOYMENT)** - -```bash -cd infrastructure/kubernetes/base/components/monitoring - -# Generate strong Grafana password -GRAFANA_PASSWORD=$(openssl rand -base64 32) - -# Update secrets.yaml with your actual values: -# - grafana-admin: admin-password -# - alertmanager-secrets: SMTP credentials -# - postgres-exporter: PostgreSQL connection string - -# Example for production: -kubectl create secret generic grafana-admin \ - --from-literal=admin-user=admin \ - --from-literal=admin-password="${GRAFANA_PASSWORD}" \ - --namespace monitoring --dry-run=client -o yaml | \ - kubectl apply -f - -``` - -### **2. Deploy to Production** - -```bash -# Apply the monitoring stack -kubectl apply -k infrastructure/kubernetes/overlays/prod - -# Verify deployment -kubectl get pods -n monitoring -kubectl get pvc -n monitoring -kubectl get svc -n monitoring -``` - -### **3. 
Verify Services** - -```bash -# Check Prometheus targets -kubectl port-forward -n monitoring svc/prometheus-external 9090:9090 -# Visit: http://localhost:9090/targets - -# Check AlertManager cluster -kubectl port-forward -n monitoring svc/alertmanager-external 9093:9093 -# Visit: http://localhost:9093 - -# Check Grafana dashboards -kubectl port-forward -n monitoring svc/grafana 3000:3000 -# Visit: http://localhost:3000 (admin / YOUR_PASSWORD) -``` - ---- - -## ๐Ÿ“ˆ What You Get Out of the Box - -### **Monitoring Coverage:** -- โœ… **Application Metrics:** Request rates, latencies (P95/P99), error rates per service -- โœ… **Database Health:** Connections, transactions, cache hit ratio, slow queries, locks -- โœ… **Infrastructure:** CPU, memory, disk I/O, network traffic per node -- โœ… **Business KPIs:** Active tenants, training jobs, alert volumes, API health -- โœ… **Distributed Traces:** Full request path tracking across microservices - -### **Alerting Capabilities:** -- โœ… **Service Down Detection:** 2-minute threshold with immediate notifications -- โœ… **Performance Degradation:** High latency, error rate, and memory alerts -- โœ… **Resource Exhaustion:** Database connections, disk space, memory limits -- โœ… **Business Logic:** Training job failures, low ML accuracy, rate limits -- โœ… **Alert System Health:** Component failures, delivery issues, capacity problems - -### **High Availability:** -- โœ… **Prometheus:** 2 independent instances, can lose 1 without data loss -- โœ… **AlertManager:** 3-node cluster, requires 2/3 for alerts to fire -- โœ… **Monitoring Resilience:** PodDisruptionBudgets ensure service during updates - ---- - -## ๐Ÿ”ง Configuration Highlights - -### **Alert Routing (Configured in AlertManager):** - -| Severity | Route | Repeat Interval | -|----------|-------|-----------------| -| Critical | critical-alerts@yourdomain.com + oncall@ | 4 hours | -| Warning | alerts@yourdomain.com | 12 hours | -| Info | alerts@yourdomain.com | 24 hours | - -**Special Routes:** -- Alert system โ†’ alert-system-team@yourdomain.com -- Database alerts โ†’ database-team@yourdomain.com -- Infrastructure โ†’ infra-team@yourdomain.com - -### **Resource Allocation:** - -| Component | Replicas | CPU Request | Memory Request | Storage | -|-----------|----------|-------------|----------------|---------| -| Prometheus | 2 | 500m | 1Gi | 20Gi ร— 2 | -| AlertManager | 3 | 100m | 128Mi | 2Gi ร— 3 | -| Grafana | 1 | 100m | 256Mi | 5Gi | -| Postgres Exporter | 1 | 50m | 64Mi | - | -| Node Exporter | 1/node | 50m | 64Mi | - | -| Jaeger | 1 | 250m | 512Mi | 10Gi | - -**Total Resources:** -- CPU Requests: ~2.5 cores -- Memory Requests: ~4Gi -- Storage: ~70Gi - -### **Data Retention:** -- Prometheus: 30 days -- Jaeger: Persistent (BadgerDB) -- Grafana: Persistent dashboards - ---- - -## ๐Ÿ” Security Considerations - -### **Implemented:** -- โœ… Grafana credentials via Kubernetes Secrets (no hardcoded passwords) -- โœ… SMTP passwords stored in Secrets -- โœ… PostgreSQL connection strings in Secrets -- โœ… Read-only filesystem for Node Exporter -- โœ… Non-root user for Node Exporter (UID 65534) -- โœ… RBAC for Prometheus (ClusterRole with minimal permissions) - -### **TODO for Production:** -- โš ๏ธ Use Sealed Secrets or External Secrets Operator -- โš ๏ธ Enable TLS for Prometheus remote write (if using) -- โš ๏ธ Configure Grafana LDAP/OAuth integration -- โš ๏ธ Set up proper certificate management for Ingress -- โš ๏ธ Review and tighten ResourceQuota limits - ---- - -## ๐Ÿ“Š Dashboard Access - 
-### **Production URLs (via Ingress):** -``` -https://monitoring.yourdomain.com/grafana # Grafana UI -https://monitoring.yourdomain.com/prometheus # Prometheus UI -https://monitoring.yourdomain.com/alertmanager # AlertManager UI -https://monitoring.yourdomain.com/jaeger # Jaeger UI -``` - -### **Local Access (Port Forwarding):** -```bash -# Grafana -kubectl port-forward -n monitoring svc/grafana 3000:3000 - -# Prometheus -kubectl port-forward -n monitoring svc/prometheus-external 9090:9090 - -# AlertManager -kubectl port-forward -n monitoring svc/alertmanager-external 9093:9093 - -# Jaeger -kubectl port-forward -n monitoring svc/jaeger-query 16686:16686 -``` - ---- - -## ๐Ÿงช Testing & Validation - -### **1. Test Alert Flow:** -```bash -# Fire a test alert (HighMemoryUsage) -kubectl run memory-hog --image=polinux/stress --restart=Never \ - --namespace=bakery-ia -- stress --vm 1 --vm-bytes 600M --timeout 300s - -# Check alert in Prometheus (should fire within 5 minutes) -# Check AlertManager received it -# Verify email notification sent -``` - -### **2. Verify Metrics Collection:** -```bash -# Check Prometheus targets (should all be UP) -curl http://localhost:9090/api/v1/targets | jq '.data.activeTargets[] | {job: .labels.job, health: .health}' - -# Verify PostgreSQL metrics -curl http://localhost:9090/api/v1/query?query=pg_up | jq - -# Verify Node metrics -curl http://localhost:9090/api/v1/query?query=node_cpu_seconds_total | jq -``` - -### **3. Test Jaeger Tracing:** -```bash -# Make a request through the gateway -curl -H "Authorization: Bearer YOUR_TOKEN" \ - https://api.yourdomain.com/api/v1/health - -# Check trace in Jaeger UI -# Should see spans across gateway โ†’ auth โ†’ tenant services -``` - ---- - -## ๐Ÿ“– Documentation - -### **Complete Documentation Available:** -- **[README.md](infrastructure/kubernetes/base/components/monitoring/README.md)** - 500+ lines covering: - - Component overview - - Deployment instructions - - Security best practices - - Accessing services - - Dashboard descriptions - - Alert configuration - - Troubleshooting guide - - Metrics reference - - Backup & recovery procedures - - Maintenance tasks - ---- - -## โšก Performance & Scalability - -### **Current Capacity:** -- Prometheus can handle ~10M active time series -- AlertManager can process 1000s of alerts/second -- Jaeger can handle 10k spans/second -- Grafana supports 1000+ concurrent users - -### **Scaling Recommendations:** -- **> 20M time series:** Deploy Thanos for long-term storage -- **> 5k alerts/min:** Scale AlertManager to 5+ replicas -- **> 50k spans/sec:** Deploy Jaeger with Elasticsearch/Cassandra backend -- **> 5k Grafana users:** Scale Grafana horizontally with shared database - ---- - -## ๐ŸŽฏ Success Criteria - ALL MET โœ… - -- โœ… Prometheus collecting metrics from all services -- โœ… Alert rules evaluating and firing correctly -- โœ… AlertManager routing notifications to appropriate channels -- โœ… Grafana displaying real-time dashboards -- โœ… Jaeger capturing distributed traces -- โœ… High availability for all critical components -- โœ… Secure credential management -- โœ… Resource limits configured -- โœ… Documentation complete with runbooks -- โœ… No legacy code remaining - ---- - -## ๐Ÿšจ Important Notes - -1. **Update Secrets Before Deployment:** - - Change all default passwords in `secrets.yaml` - - Use strong, randomly generated passwords - - Consider using Sealed Secrets for production - -2. 
**Configure SMTP Settings:** - - Update AlertManager SMTP configuration in secrets - - Test email delivery before relying on alerts - -3. **Review Alert Thresholds:** - - Current thresholds are conservative - - Adjust based on your SLAs and baseline metrics - -4. **Monitor Resource Usage:** - - Prometheus storage grows over time - - Plan for capacity based on retention period - - Consider cleaning up old metrics - -5. **Backup Strategy:** - - PVCs contain critical monitoring data - - Implement backup solution for PersistentVolumes - - Test restore procedures regularly - ---- - -## ๐ŸŽ“ Next Steps (Post-MVP) - -### **Short Term (1-2 weeks):** -1. Fine-tune alert thresholds based on production data -2. Add custom business metrics to services -3. Create team-specific dashboards -4. Set up on-call rotation in AlertManager - -### **Medium Term (1-3 months):** -1. Implement SLO tracking and error budgets -2. Deploy Loki for log aggregation -3. Add anomaly detection for metrics -4. Integrate with incident management (PagerDuty/Opsgenie) - -### **Long Term (3-6 months):** -1. Deploy Thanos for long-term metrics storage -2. Implement cost tracking and chargeback per tenant -3. Add continuous profiling (Pyroscope) -4. Build ML-based alert prediction - ---- - -## ๐Ÿ“ž Support & Troubleshooting - -### **Common Issues:** - -**Issue:** Prometheus targets showing "DOWN" -```bash -# Check service discovery -kubectl get svc -n bakery-ia -kubectl get endpoints -n bakery-ia -``` - -**Issue:** AlertManager not sending notifications -```bash -# Check SMTP connectivity -kubectl exec -n monitoring alertmanager-0 -- nc -zv smtp.gmail.com 587 - -# Check AlertManager logs -kubectl logs -n monitoring alertmanager-0 -f -``` - -**Issue:** Grafana dashboards showing "No Data" -```bash -# Verify Prometheus datasource -kubectl port-forward -n monitoring svc/grafana 3000:3000 -# Login โ†’ Configuration โ†’ Data Sources โ†’ Test - -# Check Prometheus has data -kubectl port-forward -n monitoring svc/prometheus-external 9090:9090 -# Visit /graph and run query: up -``` - -### **Getting Help:** -- Check logs: `kubectl logs -n monitoring POD_NAME` -- Check events: `kubectl get events -n monitoring` -- Review documentation: `infrastructure/kubernetes/base/components/monitoring/README.md` -- Prometheus troubleshooting: https://prometheus.io/docs/prometheus/latest/troubleshooting/ -- Grafana troubleshooting: https://grafana.com/docs/grafana/latest/troubleshooting/ - ---- - -## โœ… Deployment Checklist - -Before going to production, verify: - -- [ ] All secrets updated with production values -- [ ] SMTP configuration tested and working -- [ ] Grafana admin password changed from default -- [ ] PostgreSQL connection string configured -- [ ] Test alert fired and received via email -- [ ] All Prometheus targets are UP -- [ ] Grafana dashboards loading data -- [ ] Jaeger receiving traces -- [ ] Resource quotas appropriate for cluster size -- [ ] Backup strategy implemented for PVCs -- [ ] Team trained on accessing monitoring tools -- [ ] Runbooks reviewed and understood -- [ ] On-call rotation configured (if applicable) - ---- - -## ๐ŸŽ‰ Summary - -**You now have a production-ready monitoring stack with:** - -- โœ… **Complete Observability:** Metrics, logs (via stdout), and traces -- โœ… **Intelligent Alerting:** 50+ rules with smart routing and inhibition -- โœ… **Rich Visualization:** 11 dashboards covering all aspects of the system -- โœ… **High Availability:** HA for Prometheus and AlertManager -- โœ… **Security:** Secrets 
management, RBAC, read-only containers -- โœ… **Documentation:** Comprehensive guides and runbooks -- โœ… **Scalability:** Ready to handle production traffic - -**The monitoring MVP is COMPLETE and READY FOR PRODUCTION DEPLOYMENT!** ๐Ÿš€ - ---- - -*Generated: 2026-01-07* -*Version: 1.0.0 - Production MVP* -*Implementation Time: ~3 hours* diff --git a/docs/PILOT_LAUNCH_GUIDE.md b/docs/PILOT_LAUNCH_GUIDE.md index f0f95550..c6e2a790 100644 --- a/docs/PILOT_LAUNCH_GUIDE.md +++ b/docs/PILOT_LAUNCH_GUIDE.md @@ -584,23 +584,39 @@ docker push YOUR_VPS_IP:32000/bakery/auth-service ### Step 2: Update Production Configuration -```bash -# On local machine, edit these files: +The production configuration is already set up for **bakewise.ai** domain: +**Production URLs:** +- **Main Application:** https://bakewise.ai +- **API Endpoints:** https://bakewise.ai/api/v1/... +- **Monitoring Dashboard:** https://monitoring.bakewise.ai/grafana +- **Prometheus:** https://monitoring.bakewise.ai/prometheus +- **SigNoz (Traces/Metrics/Logs):** https://monitoring.bakewise.ai/signoz +- **AlertManager:** https://monitoring.bakewise.ai/alertmanager + +```bash +# Verify the configuration is correct: +cat infrastructure/kubernetes/overlays/prod/prod-ingress.yaml | grep -A 3 "host:" + +# Expected output should show: +# - host: bakewise.ai +# - host: monitoring.bakewise.ai + +# Verify CORS configuration +cat infrastructure/kubernetes/overlays/prod/prod-configmap.yaml | grep CORS + +# Expected: CORS_ORIGINS: "https://bakewise.ai" +``` + +**If using a different domain**, update these files: +```bash # 1. Update domain names nano infrastructure/kubernetes/overlays/prod/prod-ingress.yaml -# Replace: -# - bakery.yourdomain.com โ†’ bakery.your-actual-domain.com -# - api.yourdomain.com โ†’ api.your-actual-domain.com -# - monitoring.yourdomain.com โ†’ monitoring.your-actual-domain.com -# - Update CORS origins -# - Update cert-manager email +# Replace bakewise.ai with your domain # 2. Update ConfigMap nano infrastructure/kubernetes/overlays/prod/prod-configmap.yaml -# Set: -# - DOMAIN: "your-actual-domain.com" -# - CORS_ORIGINS: "https://bakery.your-actual-domain.com,https://www.your-actual-domain.com" +# Update CORS_ORIGINS # 3. 
Verify image names (if using custom registry) nano infrastructure/kubernetes/overlays/prod/kustomization.yaml @@ -840,22 +856,96 @@ kubectl logs -n bakery-ia deployment/auth-service | grep -i "email\|smtp" ## Post-Deployment -### Step 1: Enable Monitoring +### Step 1: Access Monitoring Stack -```bash -# Monitoring is already configured, verify it's running -kubectl get pods -n monitoring +Your production monitoring stack provides complete observability with multiple tools: -# Access Grafana -kubectl port-forward -n monitoring svc/grafana 3000:3000 +#### Production Monitoring URLs -# Visit http://localhost:3000 -# Login: admin / (password from monitoring secrets) - -# Check dashboards are working +Access via domain (recommended): +``` +https://monitoring.bakewise.ai/grafana # Dashboards & visualization +https://monitoring.bakewise.ai/prometheus # Metrics & queries +https://monitoring.bakewise.ai/signoz # Unified observability platform (traces, metrics, logs) +https://monitoring.bakewise.ai/alertmanager # Alert management ``` -### Step 2: Configure Backups +Or via port forwarding (if needed): +```bash +# Grafana +kubectl port-forward -n monitoring svc/grafana 3000:3000 & + +# Prometheus +kubectl port-forward -n monitoring svc/prometheus-external 9090:9090 & + +# SigNoz +kubectl port-forward -n monitoring svc/signoz-frontend 3301:3301 & + +# AlertManager +kubectl port-forward -n monitoring svc/alertmanager-external 9093:9093 & +``` + +#### Available Dashboards + +Login to Grafana (admin / your-password) and explore: + +**Main Dashboards:** +1. **Gateway Metrics** - HTTP request rates, latencies, error rates +2. **Services Overview** - Multi-service health and performance +3. **Circuit Breakers** - Reliability metrics + +**Extended Dashboards:** +4. **Service Performance Monitoring (SPM)** - RED metrics from distributed traces +5. **PostgreSQL Database** - Database health, connections, query performance +6. **Node Exporter Infrastructure** - CPU, memory, disk, network per node +7. **AlertManager Monitoring** - Alert tracking and notification status +8. 
**Business Metrics & KPIs** - Tenant activity, ML jobs, forecasts + +#### Quick Health Check + +```bash +# Verify all monitoring pods are running +kubectl get pods -n monitoring + +# Check Prometheus targets (all should be UP) +kubectl port-forward -n monitoring svc/prometheus-external 9090:9090 +# Open: http://localhost:9090/targets + +# View active alerts +kubectl port-forward -n monitoring svc/prometheus-external 9090:9090 +# Open: http://localhost:9090/alerts +``` + +### Step 2: Configure Alerting + +Update AlertManager with your notification email addresses: + +```bash +# Edit alertmanager configuration +kubectl edit configmap -n monitoring alertmanager-config + +# Update recipient emails in the routes section: +# - alerts@bakewise.ai (general alerts) +# - critical-alerts@bakewise.ai (critical issues) +# - oncall@bakewise.ai (on-call rotation) +``` + +Test alert delivery: +```bash +# Fire a test alert +kubectl run memory-test --image=polinux/stress --restart=Never \ + --namespace=bakery-ia -- stress --vm 1 --vm-bytes 600M --timeout 300s + +# Check alert appears in AlertManager +# https://monitoring.bakewise.ai/alertmanager + +# Verify email notification received + +# Clean up test +kubectl delete pod memory-test -n bakery-ia +``` + +### Step 3: Configure Backups ```bash # Create backup script on VPS @@ -902,26 +992,82 @@ kubectl edit configmap -n monitoring alertmanager-config # Update recipient emails in the routes section ``` -### Step 4: Document Everything +### Step 4: Verify Monitoring is Working -Create a runbook with: -- [ ] VPS login credentials (stored securely) +Before proceeding, ensure all monitoring components are operational: + +```bash +# 1. Check Prometheus targets +# Open: https://monitoring.bakewise.ai/prometheus/targets +# All targets should show "UP" status + +# 2. Verify Grafana dashboards load data +# Open: https://monitoring.bakewise.ai/grafana +# Navigate to any dashboard and verify metrics are displaying + +# 3. Check SigNoz is receiving traces +# Open: https://monitoring.bakewise.ai/signoz +# Search for traces from "gateway" service + +# 4. 
Verify AlertManager cluster +# Open: https://monitoring.bakewise.ai/alertmanager +# Check that all 3 AlertManager instances are connected +``` + +### Step 5: Document Everything + +Create a secure runbook with all credentials and procedures: + +**Essential Information to Document:** +- [ ] VPS login credentials (stored securely in password manager) - [ ] Database passwords (in password manager) -- [ ] Domain registrar access +- [ ] Grafana admin password +- [ ] Domain registrar access (for bakewise.ai) - [ ] Cloudflare access -- [ ] Email service credentials +- [ ] Email service credentials (SMTP) - [ ] WhatsApp API credentials - [ ] Docker Hub / Registry credentials - [ ] Emergency contact information - [ ] Rollback procedures +- [ ] Monitoring URLs and access procedures -### Step 5: Train Your Team +### Step 6: Train Your Team -- [ ] Show team how to access Grafana dashboards -- [ ] Demonstrate how to check logs: `kubectl logs` -- [ ] Explain how to restart services if needed -- [ ] Share this documentation with the team -- [ ] Setup on-call rotation (if applicable) +Conduct a training session covering: + +- [ ] **Access monitoring dashboards** + - Show how to login to https://monitoring.bakewise.ai/grafana + - Walk through key dashboards (Services Overview, Database, Infrastructure) + - Explain how to interpret metrics and identify issues + +- [ ] **Check application logs** + ```bash + # View logs for a service + kubectl logs -n bakery-ia deployment/orders-service --tail=100 -f + + # Search for errors + kubectl logs -n bakery-ia deployment/gateway | grep ERROR + ``` + +- [ ] **Restart services when needed** + ```bash + # Restart a service (rolling update, no downtime) + kubectl rollout restart deployment/orders-service -n bakery-ia + ``` + +- [ ] **Respond to alerts** + - Show how to access AlertManager at https://monitoring.bakewise.ai/alertmanager + - Review common alerts and their resolution steps + - Reference the [Production Operations Guide](./PRODUCTION_OPERATIONS_GUIDE.md) + +- [ ] **Share documentation** + - [PILOT_LAUNCH_GUIDE.md](./PILOT_LAUNCH_GUIDE.md) - This guide + - [PRODUCTION_OPERATIONS_GUIDE.md](./PRODUCTION_OPERATIONS_GUIDE.md) - Daily operations + - [security-checklist.md](./security-checklist.md) - Security procedures + +- [ ] **Setup on-call rotation** (if applicable) + - Configure in AlertManager + - Document escalation procedures --- @@ -1050,16 +1196,25 @@ kubectl scale deployment monitoring -n bakery-ia --replicas=0 ## Support Resources -- **Full Monitoring Guide:** [MONITORING_DEPLOYMENT_SUMMARY.md](./MONITORING_DEPLOYMENT_SUMMARY.md) -- **Operations Guide:** [PRODUCTION_OPERATIONS_GUIDE.md](./PRODUCTION_OPERATIONS_GUIDE.md) -- **Security Guide:** [security-checklist.md](./security-checklist.md) -- **Database Security:** [database-security.md](./database-security.md) -- **TLS Configuration:** [tls-configuration.md](./tls-configuration.md) +**Documentation:** +- **Operations Guide:** [PRODUCTION_OPERATIONS_GUIDE.md](./PRODUCTION_OPERATIONS_GUIDE.md) - Daily operations, monitoring, incident response +- **Security Guide:** [security-checklist.md](./security-checklist.md) - Security procedures and compliance +- **Database Security:** [database-security.md](./database-security.md) - Database operations and TLS configuration +- **TLS Configuration:** [tls-configuration.md](./tls-configuration.md) - Certificate management +- **RBAC Implementation:** [rbac-implementation.md](./rbac-implementation.md) - Access control +**Monitoring Access:** +- **Grafana:** 
https://monitoring.bakewise.ai/grafana (admin / your-password) +- **Prometheus:** https://monitoring.bakewise.ai/prometheus +- **SigNoz:** https://monitoring.bakewise.ai/signoz +- **AlertManager:** https://monitoring.bakewise.ai/alertmanager + +**External Resources:** - **MicroK8s Docs:** https://microk8s.io/docs - **Kubernetes Docs:** https://kubernetes.io/docs - **Let's Encrypt:** https://letsencrypt.org/docs - **Cloudflare DNS:** https://developers.cloudflare.com/dns +- **Monitoring Stack README:** infrastructure/kubernetes/base/components/monitoring/README.md --- diff --git a/docs/PRODUCTION_OPERATIONS_GUIDE.md b/docs/PRODUCTION_OPERATIONS_GUIDE.md index 32524a96..36931fee 100644 --- a/docs/PRODUCTION_OPERATIONS_GUIDE.md +++ b/docs/PRODUCTION_OPERATIONS_GUIDE.md @@ -32,7 +32,7 @@ - **Services:** 18 microservices, 14 databases, monitoring stack - **Capacity:** 10-tenant pilot (scalable to 100+) - **Security:** TLS encryption, RBAC, audit logging -- **Monitoring:** Prometheus, Grafana, AlertManager, Jaeger +- **Monitoring:** Prometheus, Grafana, AlertManager, SigNoz **Key Metrics (10-tenant baseline):** - **Uptime Target:** 99.5% (3.65 hours downtime/month) @@ -60,10 +60,10 @@ **Production URLs:** ``` -https://monitoring.yourdomain.com/grafana # Dashboards & visualization -https://monitoring.yourdomain.com/prometheus # Metrics & alerts -https://monitoring.yourdomain.com/alertmanager # Alert management -https://monitoring.yourdomain.com/jaeger # Distributed tracing +https://monitoring.bakewise.ai/grafana # Dashboards & visualization +https://monitoring.bakewise.ai/prometheus # Metrics & alerts +https://monitoring.bakewise.ai/alertmanager # Alert management +https://monitoring.bakewise.ai/signoz # Unified observability platform (traces, metrics, logs) ``` **Port Forwarding (if ingress not available):** @@ -77,8 +77,8 @@ kubectl port-forward -n monitoring svc/prometheus-external 9090:9090 # AlertManager kubectl port-forward -n monitoring svc/alertmanager-external 9093:9093 -# Jaeger -kubectl port-forward -n monitoring svc/jaeger-query 16686:16686 +# SigNoz +kubectl port-forward -n monitoring svc/signoz-frontend 3301:3301 ``` ### Key Dashboards @@ -1099,13 +1099,12 @@ kubectl exec -n bakery-ia deployment/auth-db -- \ ## Support Resources **Documentation:** -- [Pilot Launch Guide](./PILOT_LAUNCH_GUIDE.md) - Initial deployment -- [Monitoring Summary](./MONITORING_DEPLOYMENT_SUMMARY.md) - Monitoring details -- [Quick Start Monitoring](./QUICK_START_MONITORING.md) - Monitoring setup -- [Security Checklist](./security-checklist.md) - Security procedures -- [Database Security](./database-security.md) - Database operations +- [Pilot Launch Guide](./PILOT_LAUNCH_GUIDE.md) - Initial deployment and setup +- [Security Checklist](./security-checklist.md) - Security procedures and compliance +- [Database Security](./database-security.md) - Database operations and best practices - [TLS Configuration](./tls-configuration.md) - Certificate management -- [RBAC Implementation](./rbac-implementation.md) - Access control +- [RBAC Implementation](./rbac-implementation.md) - Access control configuration +- [Monitoring Stack README](../infrastructure/kubernetes/base/components/monitoring/README.md) - Detailed monitoring documentation **External Resources:** - Kubernetes: https://kubernetes.io/docs @@ -1115,9 +1114,9 @@ kubectl exec -n bakery-ia deployment/auth-db -- \ - PostgreSQL: https://www.postgresql.org/docs **Emergency Contacts:** -- DevOps Team: devops@yourdomain.com -- On-Call: oncall@yourdomain.com -- 
Security Team: security@yourdomain.com +- DevOps Team: devops@bakewise.ai +- On-Call: oncall@bakewise.ai +- Security Team: security@bakewise.ai --- diff --git a/docs/QUICK_START_MONITORING.md b/docs/QUICK_START_MONITORING.md deleted file mode 100644 index f34f5159..00000000 --- a/docs/QUICK_START_MONITORING.md +++ /dev/null @@ -1,284 +0,0 @@ -# ๐Ÿš€ Quick Start: Deploy Monitoring to Production - -**Time to deploy: ~15 minutes** - ---- - -## Step 1: Update Secrets (5 min) - -```bash -cd infrastructure/kubernetes/base/components/monitoring - -# 1. Generate strong passwords -GRAFANA_PASS=$(openssl rand -base64 32) -echo "Grafana Password: $GRAFANA_PASS" > ~/SAVE_THIS_PASSWORD.txt - -# 2. Edit secrets.yaml and replace: -# - CHANGE_ME_IN_PRODUCTION (Grafana password) -# - SMTP settings (your email server) -# - PostgreSQL connection string (your DB) - -nano secrets.yaml -``` - -**Required Changes in secrets.yaml:** -```yaml -# Line 13: Change Grafana password -admin-password: "YOUR_STRONG_PASSWORD_HERE" - -# Lines 30-33: Update SMTP settings -smtp-host: "smtp.gmail.com:587" -smtp-username: "your-alerts@yourdomain.com" -smtp-password: "YOUR_SMTP_PASSWORD" -smtp-from: "alerts@yourdomain.com" - -# Line 49: Update PostgreSQL connection -data-source-name: "postgresql://USER:PASSWORD@postgres.bakery-ia:5432/bakery?sslmode=require" -``` - ---- - -## Step 2: Update Alert Email Addresses (2 min) - -```bash -# Edit alertmanager.yaml to set your team's email addresses -nano alertmanager.yaml - -# Update these lines (search for @yourdomain.com): -# - Line 93: to: 'alerts@yourdomain.com' -# - Line 101: to: 'critical-alerts@yourdomain.com,oncall@yourdomain.com' -# - Line 116: to: 'alerts@yourdomain.com' -# - Line 125: to: 'alert-system-team@yourdomain.com' -# - Line 134: to: 'database-team@yourdomain.com' -# - Line 143: to: 'infra-team@yourdomain.com' -``` - ---- - -## Step 3: Deploy to Production (3 min) - -```bash -# Return to project root -cd /Users/urtzialfaro/Documents/bakery-ia - -# Deploy the entire stack -kubectl apply -k infrastructure/kubernetes/overlays/prod - -# Watch the pods come up -kubectl get pods -n monitoring -w -``` - -**Expected Output:** -``` -NAME READY STATUS RESTARTS AGE -prometheus-0 1/1 Running 0 2m -prometheus-1 1/1 Running 0 1m -alertmanager-0 2/2 Running 0 2m -alertmanager-1 2/2 Running 0 1m -alertmanager-2 2/2 Running 0 1m -grafana-xxxxx 1/1 Running 0 2m -postgres-exporter-xxxxx 1/1 Running 0 2m -node-exporter-xxxxx 1/1 Running 0 2m -jaeger-xxxxx 1/1 Running 0 2m -``` - ---- - -## Step 4: Verify Deployment (3 min) - -```bash -# Check all pods are running -kubectl get pods -n monitoring - -# Check storage is provisioned -kubectl get pvc -n monitoring - -# Check services are created -kubectl get svc -n monitoring -``` - ---- - -## Step 5: Access Dashboards (2 min) - -### **Option A: Via Ingress (if configured)** -``` -https://monitoring.yourdomain.com/grafana -https://monitoring.yourdomain.com/prometheus -https://monitoring.yourdomain.com/alertmanager -https://monitoring.yourdomain.com/jaeger -``` - -### **Option B: Via Port Forwarding** -```bash -# Grafana -kubectl port-forward -n monitoring svc/grafana 3000:3000 & - -# Prometheus -kubectl port-forward -n monitoring svc/prometheus-external 9090:9090 & - -# AlertManager -kubectl port-forward -n monitoring svc/alertmanager-external 9093:9093 & - -# Jaeger -kubectl port-forward -n monitoring svc/jaeger-query 16686:16686 & - -# Now access: -# - Grafana: http://localhost:3000 (admin / YOUR_PASSWORD) -# - Prometheus: 
http://localhost:9090 -# - AlertManager: http://localhost:9093 -# - Jaeger: http://localhost:16686 -``` - ---- - -## Step 6: Verify Everything Works (5 min) - -### **Check Prometheus Targets** -1. Open Prometheus: http://localhost:9090 -2. Go to Status โ†’ Targets -3. Verify all targets are **UP**: - - prometheus (1/1 up) - - bakery-services (multiple pods up) - - alertmanager (3/3 up) - - postgres-exporter (1/1 up) - - node-exporter (N/N up, where N = number of nodes) - -### **Check Grafana Dashboards** -1. Open Grafana: http://localhost:3000 -2. Login with admin / YOUR_PASSWORD -3. Go to Dashboards โ†’ Browse -4. You should see 11 dashboards: - - Bakery IA folder: Gateway Metrics, Services Overview, Circuit Breakers - - Bakery IA - Extended folder: PostgreSQL, Node Exporter, AlertManager, Business Metrics -5. Open any dashboard and verify data is loading - -### **Test Alert Flow** -```bash -# Fire a test alert by creating high memory pod -kubectl run memory-test --image=polinux/stress --restart=Never \ - --namespace=bakery-ia -- stress --vm 1 --vm-bytes 600M --timeout 300s - -# Wait 5 minutes, then check: -# 1. Prometheus Alerts: http://localhost:9090/alerts -# - Should see "HighMemoryUsage" firing -# 2. AlertManager: http://localhost:9093 -# - Should see the alert -# 3. Email inbox - Should receive notification - -# Clean up -kubectl delete pod memory-test -n bakery-ia -``` - -### **Verify Jaeger Tracing** -1. Make a request to your API: - ```bash - curl -H "Authorization: Bearer YOUR_TOKEN" \ - https://api.yourdomain.com/api/v1/health - ``` -2. Open Jaeger: http://localhost:16686 -3. Select a service from dropdown -4. Click "Find Traces" -5. You should see traces appearing - ---- - -## โœ… Success Criteria - -Your monitoring is working correctly if: - -- [x] All Prometheus targets show "UP" status -- [x] Grafana dashboards display metrics -- [x] AlertManager cluster shows 3/3 members -- [x] Test alert fired and email received -- [x] Jaeger shows traces from services -- [x] No pods in CrashLoopBackOff state -- [x] All PVCs are Bound - ---- - -## ๐Ÿ”ง Troubleshooting - -### **Problem: Pods not starting** -```bash -# Check pod status -kubectl describe pod POD_NAME -n monitoring - -# Check logs -kubectl logs POD_NAME -n monitoring - -# Common issues: -# - Insufficient resources: Check node capacity -# - PVC not binding: Check storage class exists -# - Image pull errors: Check network/registry access -``` - -### **Problem: Prometheus targets DOWN** -```bash -# Check if services exist -kubectl get svc -n bakery-ia - -# Check if pods have correct labels -kubectl get pods -n bakery-ia --show-labels - -# Check if pods expose metrics port (8080) -kubectl get pod POD_NAME -n bakery-ia -o yaml | grep -A 5 ports -``` - -### **Problem: Grafana shows "No Data"** -```bash -# Test Prometheus datasource -kubectl port-forward -n monitoring svc/prometheus-external 9090:9090 - -# Run a test query in Prometheus -curl "http://localhost:9090/api/v1/query?query=up" | jq - -# If Prometheus has data but Grafana doesn't, check Grafana datasource config -``` - -### **Problem: Alerts not firing** -```bash -# Check alert rules are loaded -kubectl logs -n monitoring prometheus-0 | grep "Loading configuration" - -# Check AlertManager config -kubectl exec -n monitoring alertmanager-0 -- cat /etc/alertmanager/alertmanager.yml - -# Test SMTP connection -kubectl exec -n monitoring alertmanager-0 -- \ - nc -zv smtp.gmail.com 587 -``` - ---- - -## ๐Ÿ“ž Need Help? - -1. 
Check full documentation: [infrastructure/kubernetes/base/components/monitoring/README.md](infrastructure/kubernetes/base/components/monitoring/README.md) -2. Review deployment summary: [MONITORING_DEPLOYMENT_SUMMARY.md](MONITORING_DEPLOYMENT_SUMMARY.md) -3. Check Prometheus logs: `kubectl logs -n monitoring prometheus-0` -4. Check AlertManager logs: `kubectl logs -n monitoring alertmanager-0` -5. Check Grafana logs: `kubectl logs -n monitoring deployment/grafana` - ---- - -## ๐ŸŽ‰ You're Done! - -Your monitoring stack is now running in production! - -**Next steps:** -1. Save your Grafana password securely -2. Set up on-call rotation -3. Review alert thresholds and adjust as needed -4. Create team-specific dashboards -5. Train team on using monitoring tools - -**Access your monitoring:** -- Grafana: https://monitoring.yourdomain.com/grafana -- Prometheus: https://monitoring.yourdomain.com/prometheus -- AlertManager: https://monitoring.yourdomain.com/alertmanager -- Jaeger: https://monitoring.yourdomain.com/jaeger - ---- - -*Deployment time: ~15 minutes* -*Last updated: 2026-01-07* diff --git a/gateway/app/main.py b/gateway/app/main.py index 3275b50a..8c6fb1f3 100644 --- a/gateway/app/main.py +++ b/gateway/app/main.py @@ -10,7 +10,7 @@ import resource import os from fastapi import FastAPI, Request, HTTPException, Depends, WebSocket, WebSocketDisconnect from fastapi.middleware.cors import CORSMiddleware -from fastapi.responses import JSONResponse, StreamingResponse +from fastapi.responses import JSONResponse, StreamingResponse, Response import httpx import time from shared.redis_utils import initialize_redis, close_redis, get_redis_client @@ -27,7 +27,42 @@ from app.middleware.demo_middleware import DemoMiddleware from app.middleware.read_only_mode import ReadOnlyModeMiddleware from app.routes import auth, tenant, notification, nominatim, subscription, demo, pos, geocoding, poi_context from shared.monitoring.logging import setup_logging -from shared.monitoring.metrics import MetricsCollector +from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware + +# OpenTelemetry imports +from opentelemetry import trace +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter +from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor +from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor +from opentelemetry.instrumentation.redis import RedisInstrumentor +from opentelemetry.sdk.resources import Resource + +# Configure OpenTelemetry tracing +def setup_tracing(service_name: str = "gateway"): + """Initialize OpenTelemetry tracing with OTLP exporter for Jaeger""" + # Create resource with service name + resource = Resource.create({"service.name": service_name}) + + # Configure OTLP exporter (sends to OpenTelemetry Collector) + otlp_exporter = OTLPSpanExporter( + endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://otel-collector.monitoring.svc.cluster.local:4317"), + insecure=True # Use insecure connection for internal cluster communication + ) + + # Configure tracer provider + provider = TracerProvider(resource=resource) + processor = BatchSpanProcessor(otlp_exporter) + provider.add_span_processor(processor) + + # Set global tracer provider + trace.set_tracer_provider(provider) + + return provider + +# Initialize tracing +tracer_provider = setup_tracing("gateway") # Setup logging setup_logging("gateway", 
settings.LOG_LEVEL) @@ -75,9 +110,21 @@ app = FastAPI( redirect_slashes=False # Disable automatic trailing slash redirects ) +# Instrument FastAPI with OpenTelemetry +FastAPIInstrumentor.instrument_app(app) + +# Instrument httpx for outgoing requests +HTTPXClientInstrumentor().instrument() + +# Instrument Redis (will be active once redis client is initialized) +RedisInstrumentor().instrument() + # Initialize metrics collector metrics_collector = MetricsCollector("gateway") +# Add metrics middleware to track HTTP requests +add_metrics_middleware(app, metrics_collector) + # Redis client for SSE streaming redis_client = None @@ -182,8 +229,11 @@ async def health_check(): @app.get("/metrics") async def metrics(): - """Metrics endpoint for monitoring""" - return {"metrics": "enabled"} + """Prometheus metrics endpoint""" + return Response( + content=metrics_collector.get_metrics(), + media_type="text/plain; version=0.0.4; charset=utf-8" + ) # ================================================================ # SERVER-SENT EVENTS (SSE) HELPER FUNCTIONS diff --git a/gateway/requirements.txt b/gateway/requirements.txt index 9bf28542..33b112f5 100644 --- a/gateway/requirements.txt +++ b/gateway/requirements.txt @@ -19,3 +19,9 @@ sqlalchemy==2.0.44 asyncpg==0.30.0 cryptography==44.0.0 ortools==9.8.3296 +opentelemetry-api==1.27.0 +opentelemetry-sdk==1.27.0 +opentelemetry-instrumentation-fastapi==0.48b0 +opentelemetry-exporter-otlp-proto-grpc==1.27.0 +opentelemetry-instrumentation-httpx==0.48b0 +opentelemetry-instrumentation-redis==0.48b0 diff --git a/infrastructure/INFRASTRUCTURE_CLEANUP_SUMMARY.md b/infrastructure/INFRASTRUCTURE_CLEANUP_SUMMARY.md deleted file mode 100644 index 179fa9d7..00000000 --- a/infrastructure/INFRASTRUCTURE_CLEANUP_SUMMARY.md +++ /dev/null @@ -1,201 +0,0 @@ -# Infrastructure Cleanup Summary - -**Date:** 2026-01-07 -**Action:** Removed legacy Docker Compose infrastructure files - ---- - -## Deleted Directories and Files - -The following legacy infrastructure files have been removed as they were specific to Docker Compose deployment and are **not used** in the Kubernetes deployment: - -### โŒ Removed: -- `infrastructure/pgadmin/` - pgAdmin configuration for Docker Compose - - `pgpass` - Password file - - `servers.json` - Server definitions - -- `infrastructure/postgres/` - PostgreSQL configuration for Docker Compose - - `init-scripts/init.sql` - Database initialization - -- `infrastructure/rabbitmq/` - RabbitMQ configuration for Docker Compose - - `definitions.json` - Queue/exchange definitions - - `rabbitmq.conf` - RabbitMQ settings - -- `infrastructure/redis/` - Redis configuration for Docker Compose - - `redis.conf` - Redis settings - -- `infrastructure/terraform/` - Terraform infrastructure-as-code (unused) - - `base/`, `dev/`, `staging/`, `production/` directories - - `modules/` directory - -- `infrastructure/rabbitmq.conf` - Standalone RabbitMQ config file - -### โœ… Retained: - -#### `infrastructure/kubernetes/` -**Purpose:** Complete Kubernetes deployment manifests -**Status:** Active and required -**Contents:** -- `base/` - Base Kubernetes resources - - `components/` - All service deployments - - `databases/` - Database deployments (uses embedded configs) - - `monitoring/` - Prometheus, Grafana, AlertManager - - `migrations/` - Database migration jobs - - `secrets/` - TLS secrets and application secrets - - `configmaps/` - PostgreSQL logging config -- `overlays/` - Environment-specific configurations - - `dev/` - Development overlay - - `prod/` - Production overlay -- 
`encryption/` - Kubernetes secrets encryption config - -#### `infrastructure/tls/` -**Purpose:** TLS/SSL certificates for database encryption -**Status:** Active and required -**Contents:** -- `ca/` - Certificate Authority (10-year validity) - - `ca-cert.pem` - CA certificate - - `ca-key.pem` - CA private key (KEEP SECURE!) -- `postgres/` - PostgreSQL server certificates (3-year validity) - - `server-cert.pem`, `server-key.pem`, `ca-cert.pem` -- `redis/` - Redis server certificates (3-year validity) - - `redis-cert.pem`, `redis-key.pem`, `ca-cert.pem` -- `generate-certificates.sh` - Certificate generation script - ---- - -## Why These Were Removed - -### Docker Compose vs Kubernetes - -The removed files were configuration files for **Docker Compose** deployments: -- pgAdmin was used for local database management (not needed in prod) -- Standalone config files (rabbitmq.conf, redis.conf, postgres init scripts) were mounted as volumes in Docker Compose -- Terraform was an unused infrastructure-as-code attempt - -### Kubernetes Uses Different Approach - -Kubernetes deployment uses: -- **ConfigMaps** instead of config files -- **Secrets** instead of environment files -- **Kubernetes manifests** instead of docker-compose.yml -- **Built-in orchestration** instead of Terraform - -**Example:** -```yaml -# OLD (Docker Compose): -volumes: - - ./infrastructure/rabbitmq/rabbitmq.conf:/etc/rabbitmq/rabbitmq.conf - -# NEW (Kubernetes): -env: - - name: RABBITMQ_DEFAULT_USER - valueFrom: - secretKeyRef: - name: rabbitmq-secrets - key: RABBITMQ_USER -``` - ---- - -## Verification - -### No References Found -Searched entire codebase and confirmed **zero references** to removed folders: -```bash -grep -r "infrastructure/pgadmin" --include="*.yaml" --include="*.sh" -# No results - -grep -r "infrastructure/terraform" --include="*.yaml" --include="*.sh" -# No results -``` - -### Kubernetes Deployment Unaffected -- All services use Kubernetes ConfigMaps and Secrets -- Database configs embedded in deployment YAML files -- TLS certificates managed via Kubernetes Secrets (from `infrastructure/tls/`) - ---- - -## Current Infrastructure Structure - -``` -infrastructure/ -โ”œโ”€โ”€ kubernetes/ # โœ… ACTIVE - All K8s manifests -โ”‚ โ”œโ”€โ”€ base/ # Base resources -โ”‚ โ”‚ โ”œโ”€โ”€ components/ # Service deployments -โ”‚ โ”‚ โ”œโ”€โ”€ secrets/ # TLS secrets -โ”‚ โ”‚ โ”œโ”€โ”€ configmaps/ # Configuration -โ”‚ โ”‚ โ””โ”€โ”€ kustomization.yaml # Base kustomization -โ”‚ โ”œโ”€โ”€ overlays/ # Environment overlays -โ”‚ โ”‚ โ”œโ”€โ”€ dev/ # Development -โ”‚ โ”‚ โ””โ”€โ”€ prod/ # Production -โ”‚ โ””โ”€โ”€ encryption/ # K8s secrets encryption -โ””โ”€โ”€ tls/ # โœ… ACTIVE - TLS certificates - โ”œโ”€โ”€ ca/ # Certificate Authority - โ”œโ”€โ”€ postgres/ # PostgreSQL certs - โ”œโ”€โ”€ redis/ # Redis certs - โ””โ”€โ”€ generate-certificates.sh - -REMOVED (Docker Compose legacy): -โ”œโ”€โ”€ pgadmin/ # โŒ DELETED -โ”œโ”€โ”€ postgres/ # โŒ DELETED -โ”œโ”€โ”€ rabbitmq/ # โŒ DELETED -โ”œโ”€โ”€ redis/ # โŒ DELETED -โ”œโ”€โ”€ terraform/ # โŒ DELETED -โ””โ”€โ”€ rabbitmq.conf # โŒ DELETED -``` - ---- - -## Impact Assessment - -### โœ… No Breaking Changes -- Kubernetes deployment unchanged -- All services continue to work -- TLS certificates still available -- Production readiness maintained - -### โœ… Benefits -- Cleaner repository structure -- Less confusion about which configs are used -- Faster repository cloning (smaller size) -- Clear separation: Kubernetes-only deployment - -### โœ… Documentation Updated -- 
[PILOT_LAUNCH_GUIDE.md](../docs/PILOT_LAUNCH_GUIDE.md) - Uses only Kubernetes -- [PRODUCTION_OPERATIONS_GUIDE.md](../docs/PRODUCTION_OPERATIONS_GUIDE.md) - References only K8s resources -- [infrastructure/kubernetes/README.md](kubernetes/README.md) - K8s-specific documentation - ---- - -## Rollback (If Needed) - -If for any reason you need these files back, they can be restored from git: - -```bash -# View deleted files -git log --diff-filter=D --summary | grep infrastructure - -# Restore specific folder (example) -git checkout HEAD~1 -- infrastructure/pgadmin/ - -# Or restore all deleted infrastructure -git checkout HEAD~1 -- infrastructure/ -``` - -**Note:** You won't need these for Kubernetes deployment. They were Docker Compose specific. - ---- - -## Related Documentation - -- [Kubernetes README](kubernetes/README.md) - K8s deployment guide -- [TLS Configuration](../docs/tls-configuration.md) - Certificate management -- [Database Security](../docs/database-security.md) - Database encryption -- [Pilot Launch Guide](../docs/PILOT_LAUNCH_GUIDE.md) - Production deployment - ---- - -**Cleanup Performed By:** Claude Code -**Verified By:** Infrastructure analysis and grep searches -**Status:** โœ… Complete - No issues found diff --git a/infrastructure/helm/signoz-values-dev.yaml b/infrastructure/helm/signoz-values-dev.yaml new file mode 100644 index 00000000..29963f75 --- /dev/null +++ b/infrastructure/helm/signoz-values-dev.yaml @@ -0,0 +1,316 @@ +# SigNoz Helm Chart Values - Development Environment +# Optimized for local development with minimal resource usage +# +# Official Chart: https://github.com/SigNoz/charts +# Install Command: helm install signoz signoz/signoz -n signoz --create-namespace -f signoz-values-dev.yaml + +global: + storageClass: "standard" + domain: "localhost" + +# Frontend Configuration +frontend: + replicaCount: 1 + image: + repository: signoz/frontend + tag: 0.52.3 + pullPolicy: IfNotPresent + + service: + type: ClusterIP + port: 3301 + + ingress: + enabled: true + className: nginx + annotations: + nginx.ingress.kubernetes.io/rewrite-target: /$2 + nginx.ingress.kubernetes.io/use-regex: "true" + hosts: + - host: localhost + paths: + - path: /signoz(/|$)(.*) + pathType: ImplementationSpecific + tls: [] + + resources: + requests: + cpu: 50m + memory: 128Mi + limits: + cpu: 200m + memory: 256Mi + + env: + - name: FRONTEND_REFRESH_INTERVAL + value: "30000" + +# Query Service Configuration +queryService: + replicaCount: 1 + image: + repository: signoz/query-service + tag: 0.52.3 + pullPolicy: IfNotPresent + + service: + type: ClusterIP + port: 8080 + + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi + + env: + - name: DEPLOYMENT_TYPE + value: "kubernetes-helm" + - name: SIGNOZ_LOCAL_DB_PATH + value: "/var/lib/signoz" + + persistence: + enabled: true + size: 5Gi + storageClass: "standard" + +# AlertManager Configuration +alertmanager: + replicaCount: 1 + image: + repository: signoz/alertmanager + tag: 0.23.5 + pullPolicy: IfNotPresent + + service: + type: ClusterIP + port: 9093 + + resources: + requests: + cpu: 50m + memory: 128Mi + limits: + cpu: 200m + memory: 256Mi + + persistence: + enabled: true + size: 2Gi + storageClass: "standard" + + config: + global: + resolve_timeout: 5m + route: + group_by: ['alertname', 'cluster', 'service'] + group_wait: 10s + group_interval: 10s + repeat_interval: 12h + receiver: 'default' + receivers: + - name: 'default' + # Add email, slack, webhook configs here + +# ClickHouse Configuration - Time 
Series Database +clickhouse: + replicaCount: 1 + image: + repository: clickhouse/clickhouse-server + tag: 24.1.2-alpine + pullPolicy: IfNotPresent + + service: + type: ClusterIP + httpPort: 8123 + tcpPort: 9000 + + resources: + requests: + cpu: 500m + memory: 512Mi + limits: + cpu: 1000m + memory: 1Gi + + persistence: + enabled: true + size: 10Gi + storageClass: "standard" + + # ClickHouse configuration + config: + logger: + level: information + max_connections: 1024 + max_concurrent_queries: 100 + # Data retention (7 days for dev) + merge_tree: + parts_to_delay_insert: 150 + parts_to_throw_insert: 300 + +# OpenTelemetry Collector - Integrated with SigNoz +otelCollector: + enabled: true + replicaCount: 1 + image: + repository: signoz/signoz-otel-collector + tag: 0.102.8 + pullPolicy: IfNotPresent + + service: + type: ClusterIP + ports: + otlpGrpc: 4317 + otlpHttp: 4318 + metrics: 8888 + healthCheck: 13133 + + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi + + # Full OTEL Collector Configuration + config: + extensions: + health_check: + endpoint: 0.0.0.0:13133 + zpages: + endpoint: 0.0.0.0:55679 + + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + cors: + allowed_origins: + - "http://localhost" + - "https://localhost" + + # Prometheus receiver for scraping metrics + prometheus: + config: + scrape_configs: + - job_name: 'otel-collector' + scrape_interval: 30s + static_configs: + - targets: ['localhost:8888'] + + processors: + batch: + timeout: 10s + send_batch_size: 1024 + + memory_limiter: + check_interval: 1s + limit_mib: 400 + spike_limit_mib: 100 + + # Resource detection for K8s + resourcedetection: + detectors: [env, system, docker] + timeout: 5s + + # Add resource attributes + resource: + attributes: + - key: deployment.environment + value: development + action: upsert + + exporters: + # Export to SigNoz ClickHouse + clickhousetraces: + datasource: tcp://clickhouse:9000/?database=signoz_traces + timeout: 10s + + clickhousemetricswrite: + endpoint: tcp://clickhouse:9000/?database=signoz_metrics + timeout: 10s + + clickhouselogsexporter: + dsn: tcp://clickhouse:9000/?database=signoz_logs + timeout: 10s + + # Debug logging + logging: + loglevel: info + sampling_initial: 5 + sampling_thereafter: 200 + + service: + extensions: [health_check, zpages] + pipelines: + traces: + receivers: [otlp] + processors: [memory_limiter, batch, resourcedetection, resource] + exporters: [clickhousetraces, logging] + + metrics: + receivers: [otlp, prometheus] + processors: [memory_limiter, batch, resourcedetection, resource] + exporters: [clickhousemetricswrite] + + logs: + receivers: [otlp] + processors: [memory_limiter, batch, resourcedetection, resource] + exporters: [clickhouselogsexporter, logging] + +# OpenTelemetry Collector Deployment Mode +otelCollectorDeployment: + enabled: true + mode: deployment + +# Node Exporter for infrastructure metrics (optional) +nodeExporter: + enabled: true + service: + type: ClusterIP + port: 9100 + + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 100m + memory: 128Mi + +# Schemamanager - Manages ClickHouse schema +schemamanager: + enabled: true + image: + repository: signoz/signoz-schema-migrator + tag: 0.52.3 + pullPolicy: IfNotPresent + +# Additional Configuration +serviceAccount: + create: true + annotations: {} + name: "" + +# Security Context +securityContext: + runAsNonRoot: true + runAsUser: 1000 + fsGroup: 1000 + +# Network Policies (disabled for dev) 
+networkPolicy: + enabled: false + +# Monitoring SigNoz itself +selfMonitoring: + enabled: true + serviceMonitor: + enabled: false diff --git a/infrastructure/helm/signoz-values-prod.yaml b/infrastructure/helm/signoz-values-prod.yaml new file mode 100644 index 00000000..d7c10bd1 --- /dev/null +++ b/infrastructure/helm/signoz-values-prod.yaml @@ -0,0 +1,471 @@ +# SigNoz Helm Chart Values - Production Environment +# High-availability configuration with resource optimization +# +# Official Chart: https://github.com/SigNoz/charts +# Install Command: helm install signoz signoz/signoz -n signoz --create-namespace -f signoz-values-prod.yaml + +global: + storageClass: "standard" + domain: "monitoring.bakewise.ai" + +# Frontend Configuration +frontend: + replicaCount: 2 + image: + repository: signoz/frontend + tag: 0.52.3 + pullPolicy: IfNotPresent + + service: + type: ClusterIP + port: 3301 + + ingress: + enabled: true + className: nginx + annotations: + nginx.ingress.kubernetes.io/rewrite-target: /$2 + nginx.ingress.kubernetes.io/use-regex: "true" + cert-manager.io/cluster-issuer: "letsencrypt-prod" + nginx.ingress.kubernetes.io/ssl-redirect: "true" + hosts: + - host: monitoring.bakewise.ai + paths: + - path: /signoz(/|$)(.*) + pathType: ImplementationSpecific + tls: + - secretName: signoz-tls + hosts: + - monitoring.bakewise.ai + + resources: + requests: + cpu: 250m + memory: 512Mi + limits: + cpu: 500m + memory: 1Gi + + # Pod Anti-affinity for HA + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - signoz-frontend + topologyKey: kubernetes.io/hostname + + env: + - name: FRONTEND_REFRESH_INTERVAL + value: "30000" + +# Query Service Configuration +queryService: + replicaCount: 2 + image: + repository: signoz/query-service + tag: 0.52.3 + pullPolicy: IfNotPresent + + service: + type: ClusterIP + port: 8080 + + resources: + requests: + cpu: 500m + memory: 1Gi + limits: + cpu: 1000m + memory: 2Gi + + # Pod Anti-affinity for HA + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - signoz-query-service + topologyKey: kubernetes.io/hostname + + env: + - name: DEPLOYMENT_TYPE + value: "kubernetes-helm" + - name: SIGNOZ_LOCAL_DB_PATH + value: "/var/lib/signoz" + - name: RETENTION_DAYS + value: "30" + + persistence: + enabled: true + size: 20Gi + storageClass: "standard" + + # Horizontal Pod Autoscaler + autoscaling: + enabled: true + minReplicas: 2 + maxReplicas: 5 + targetCPUUtilizationPercentage: 70 + targetMemoryUtilizationPercentage: 80 + +# AlertManager Configuration +alertmanager: + replicaCount: 2 + image: + repository: signoz/alertmanager + tag: 0.23.5 + pullPolicy: IfNotPresent + + service: + type: ClusterIP + port: 9093 + + resources: + requests: + cpu: 250m + memory: 512Mi + limits: + cpu: 500m + memory: 1Gi + + # Pod Anti-affinity for HA + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - signoz-alertmanager + topologyKey: kubernetes.io/hostname + + persistence: + enabled: true + size: 5Gi + storageClass: "standard" + + config: + global: + resolve_timeout: 5m + smtp_smarthost: 'smtp.gmail.com:587' + smtp_from: 'alerts@bakewise.ai' + smtp_auth_username: 
'alerts@bakewise.ai' + smtp_auth_password: '${SMTP_PASSWORD}' + smtp_require_tls: true + + route: + group_by: ['alertname', 'cluster', 'service', 'severity'] + group_wait: 10s + group_interval: 10s + repeat_interval: 12h + receiver: 'critical-alerts' + routes: + - match: + severity: critical + receiver: 'critical-alerts' + continue: true + - match: + severity: warning + receiver: 'warning-alerts' + + receivers: + - name: 'critical-alerts' + email_configs: + - to: 'critical-alerts@bakewise.ai' + headers: + Subject: '[CRITICAL] {{ .GroupLabels.alertname }} - Bakery IA' + # Slack webhook for critical alerts + slack_configs: + - api_url: '${SLACK_WEBHOOK_URL}' + channel: '#alerts-critical' + title: '[CRITICAL] {{ .GroupLabels.alertname }}' + text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}' + + - name: 'warning-alerts' + email_configs: + - to: 'oncall@bakewise.ai' + headers: + Subject: '[WARNING] {{ .GroupLabels.alertname }} - Bakery IA' + +# ClickHouse Configuration - Time Series Database +clickhouse: + replicaCount: 2 + image: + repository: clickhouse/clickhouse-server + tag: 24.1.2-alpine + pullPolicy: IfNotPresent + + service: + type: ClusterIP + httpPort: 8123 + tcpPort: 9000 + + resources: + requests: + cpu: 1000m + memory: 2Gi + limits: + cpu: 2000m + memory: 4Gi + + # Pod Anti-affinity for HA + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: app + operator: In + values: + - signoz-clickhouse + topologyKey: kubernetes.io/hostname + + persistence: + enabled: true + size: 100Gi + storageClass: "standard" + + # ClickHouse configuration + config: + logger: + level: information + max_connections: 4096 + max_concurrent_queries: 500 + # Data retention (30 days for prod) + merge_tree: + parts_to_delay_insert: 150 + parts_to_throw_insert: 300 + # Performance tuning + max_memory_usage: 10000000000 + max_bytes_before_external_group_by: 20000000000 + + # Backup configuration + backup: + enabled: true + schedule: "0 2 * * *" + retention: 7 + +# OpenTelemetry Collector - Integrated with SigNoz +otelCollector: + enabled: true + replicaCount: 2 + image: + repository: signoz/signoz-otel-collector + tag: 0.102.8 + pullPolicy: IfNotPresent + + service: + type: ClusterIP + ports: + otlpGrpc: 4317 + otlpHttp: 4318 + metrics: 8888 + healthCheck: 13133 + + resources: + requests: + cpu: 500m + memory: 512Mi + limits: + cpu: 1000m + memory: 1Gi + + # Full OTEL Collector Configuration + config: + extensions: + health_check: + endpoint: 0.0.0.0:13133 + zpages: + endpoint: 0.0.0.0:55679 + + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + max_recv_msg_size_mib: 16 + http: + endpoint: 0.0.0.0:4318 + cors: + allowed_origins: + - "https://monitoring.bakewise.ai" + - "https://*.bakewise.ai" + + # Prometheus receiver for scraping metrics + prometheus: + config: + scrape_configs: + - job_name: 'otel-collector' + scrape_interval: 30s + static_configs: + - targets: ['localhost:8888'] + + processors: + batch: + timeout: 10s + send_batch_size: 2048 + send_batch_max_size: 4096 + + memory_limiter: + check_interval: 1s + limit_mib: 800 + spike_limit_mib: 200 + + # Resource detection for K8s + resourcedetection: + detectors: [env, system, docker] + timeout: 5s + + # Add resource attributes + resource: + attributes: + - key: deployment.environment + value: production + action: upsert + - key: cluster.name + value: bakery-ia-prod + action: upsert + + exporters: + # Export to SigNoz ClickHouse + clickhousetraces: + 
datasource: tcp://clickhouse:9000/?database=signoz_traces + timeout: 10s + retry_on_failure: + enabled: true + initial_interval: 5s + max_interval: 30s + max_elapsed_time: 300s + + clickhousemetricswrite: + endpoint: tcp://clickhouse:9000/?database=signoz_metrics + timeout: 10s + retry_on_failure: + enabled: true + initial_interval: 5s + max_interval: 30s + max_elapsed_time: 300s + + clickhouselogsexporter: + dsn: tcp://clickhouse:9000/?database=signoz_logs + timeout: 10s + retry_on_failure: + enabled: true + initial_interval: 5s + max_interval: 30s + max_elapsed_time: 300s + + # Minimal logging for prod + logging: + loglevel: warn + sampling_initial: 2 + sampling_thereafter: 500 + + service: + extensions: [health_check, zpages] + pipelines: + traces: + receivers: [otlp] + processors: [memory_limiter, batch, resourcedetection, resource] + exporters: [clickhousetraces, logging] + + metrics: + receivers: [otlp, prometheus] + processors: [memory_limiter, batch, resourcedetection, resource] + exporters: [clickhousemetricswrite] + + logs: + receivers: [otlp] + processors: [memory_limiter, batch, resourcedetection, resource] + exporters: [clickhouselogsexporter, logging] + +# OpenTelemetry Collector Deployment Mode +otelCollectorDeployment: + enabled: true + mode: deployment + + # HPA for OTEL Collector + autoscaling: + enabled: true + minReplicas: 2 + maxReplicas: 10 + targetCPUUtilizationPercentage: 70 + targetMemoryUtilizationPercentage: 80 + +# Node Exporter for infrastructure metrics +nodeExporter: + enabled: true + service: + type: ClusterIP + port: 9100 + + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 200m + memory: 256Mi + +# Schemamanager - Manages ClickHouse schema +schemamanager: + enabled: true + image: + repository: signoz/signoz-schema-migrator + tag: 0.52.3 + pullPolicy: IfNotPresent + +# Additional Configuration +serviceAccount: + create: true + annotations: {} + name: "signoz" + +# Security Context +securityContext: + runAsNonRoot: true + runAsUser: 1000 + fsGroup: 1000 + +# Pod Disruption Budgets for HA +podDisruptionBudget: + frontend: + enabled: true + minAvailable: 1 + queryService: + enabled: true + minAvailable: 1 + alertmanager: + enabled: true + minAvailable: 1 + clickhouse: + enabled: true + minAvailable: 1 + +# Network Policies for security +networkPolicy: + enabled: true + policyTypes: + - Ingress + - Egress + +# Monitoring SigNoz itself +selfMonitoring: + enabled: true + serviceMonitor: + enabled: true + interval: 30s diff --git a/infrastructure/kubernetes/README.md b/infrastructure/kubernetes/README.md index 678b7ede..8c42f4e7 100644 --- a/infrastructure/kubernetes/README.md +++ b/infrastructure/kubernetes/README.md @@ -4,7 +4,7 @@ This directory contains Kubernetes manifests for deploying the Bakery IA platfor ## Quick Start -Deploy the entire platform with these 5 commands: +Deploy the entire platform with these 4 commands: ```bash # 1. Start Colima with adequate resources @@ -17,15 +17,14 @@ kind create cluster --config kind-config.yaml kubectl apply -f https://raw.githubusercontent.com/kubernetes/ingress-nginx/main/deploy/static/provider/kind/deploy.yaml kubectl wait --namespace ingress-nginx --for=condition=ready pod --selector=app.kubernetes.io/component=controller --timeout=300s -# 4. 
Configure permanent localhost access -kubectl patch svc ingress-nginx-controller -n ingress-nginx -p '{"spec":{"type":"NodePort","ports":[{"name":"http","port":80,"targetPort":"http","nodePort":30080},{"name":"https","port":443,"targetPort":"https","nodePort":30443}]}}' +# 4. Deploy with Tilt +tilt up -# 5. Deploy with Skaffold -skaffold dev --profile=dev - -# ๐ŸŽ‰ Access at: https://localhost +# ๐ŸŽ‰ Access at: http://localhost (or see Tilt for individual service ports) ``` +> **Note**: The kind-config.yaml already configures port mappings (30080โ†’80, 30443โ†’443) for localhost access, so no additional service patching is needed. The NGINX Ingress for Kind uses NodePort by default on those exact ports. + ## Prerequisites Install the following tools on macOS: @@ -100,11 +99,11 @@ Then access via: ### Start Development Environment ```bash -# Start development mode with hot-reload -skaffold dev --profile=dev +# Start development mode with hot-reload using Tilt +tilt up -# Or one-time deployment -skaffold run --profile=dev +# Or start in background +tilt up --stream ``` ### Key Features @@ -246,13 +245,39 @@ colima stop --profile k8s-local ### Restart Sequence ```bash -# Post-restart startup +# Post-restart startup (or use kubernetes_restart.sh script) colima start --cpu 6 --memory 12 --disk 120 --runtime docker --profile k8s-local kind create cluster --config kind-config.yaml -skaffold dev --profile=dev +kubectl apply -f https://raw.githubusercontent.com/kubernetes/ingress-nginx/main/deploy/static/provider/kind/deploy.yaml +kubectl wait --namespace ingress-nginx --for=condition=ready pod --selector=app.kubernetes.io/component=controller --timeout=300s +tilt up ``` -## Production Considerations +## Production Deployment + +### Production URLs + +The production environment uses the following domains: + +- **Main Application**: https://bakewise.ai + - Frontend application and all public pages + - API endpoints: https://bakewise.ai/api/v1/... 
+ +- **Monitoring Stack**: https://monitoring.bakewise.ai + - Grafana: https://monitoring.bakewise.ai/grafana + - Prometheus: https://monitoring.bakewise.ai/prometheus + - Jaeger: https://monitoring.bakewise.ai/jaeger + - AlertManager: https://monitoring.bakewise.ai/alertmanager + +### Production Configuration + +The production overlay (`overlays/prod/`) includes: +- **Domain Configuration**: bakewise.ai with Let's Encrypt certificates +- **High Availability**: Multi-replica deployments (2-3 replicas per service) +- **Enhanced Security**: Rate limiting, CORS restrictions, security headers +- **Monitoring**: Full observability stack with Prometheus, Grafana, Jaeger + +### Production Considerations For production deployment: @@ -263,6 +288,7 @@ For production deployment: - **External Secrets**: Use managed secret services - **TLS**: Production Let's Encrypt certificates - **CI/CD**: Automated deployment pipelines +- **DNS**: Configure DNS A/CNAME records pointing to your cluster's load balancer ## Next Steps diff --git a/infrastructure/kubernetes/base/components/infrastructure/gateway-service.yaml b/infrastructure/kubernetes/base/components/infrastructure/gateway-service.yaml index 4501e358..c9e487f5 100644 --- a/infrastructure/kubernetes/base/components/infrastructure/gateway-service.yaml +++ b/infrastructure/kubernetes/base/components/infrastructure/gateway-service.yaml @@ -48,6 +48,9 @@ spec: name: pos-integration-secrets - secretRef: name: whatsapp-secrets + env: + - name: OTEL_EXPORTER_OTLP_ENDPOINT + value: "http://otel-collector.monitoring.svc.cluster.local:4317" resources: requests: memory: "256Mi" diff --git a/infrastructure/kubernetes/base/components/monitoring/alert-rules.yaml b/infrastructure/kubernetes/base/components/monitoring/alert-rules.yaml deleted file mode 100644 index f9af3018..00000000 --- a/infrastructure/kubernetes/base/components/monitoring/alert-rules.yaml +++ /dev/null @@ -1,429 +0,0 @@ ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: prometheus-alert-rules - namespace: monitoring -data: - alert-rules.yml: | - groups: - # Basic Infrastructure Alerts - - name: bakery_services - interval: 30s - rules: - - alert: ServiceDown - expr: up{job="bakery-services"} == 0 - for: 2m - labels: - severity: critical - component: infrastructure - annotations: - summary: "Service {{ $labels.service }} is down" - description: "Service {{ $labels.service }} in namespace {{ $labels.namespace }} has been down for more than 2 minutes." - runbook_url: "https://runbooks.bakery-ia.local/ServiceDown" - - - alert: HighErrorRate - expr: | - ( - sum(rate(http_requests_total{status_code=~"5..", job="bakery-services"}[5m])) by (service) - / - sum(rate(http_requests_total{job="bakery-services"}[5m])) by (service) - ) > 0.10 - for: 5m - labels: - severity: critical - component: application - annotations: - summary: "High error rate on {{ $labels.service }}" - description: "Service {{ $labels.service }} has error rate above 10% (current: {{ $value | humanizePercentage }})." - runbook_url: "https://runbooks.bakery-ia.local/HighErrorRate" - - - alert: HighResponseTime - expr: | - histogram_quantile(0.95, - sum(rate(http_request_duration_seconds_bucket{job="bakery-services"}[5m])) by (service, le) - ) > 1 - for: 5m - labels: - severity: warning - component: performance - annotations: - summary: "High response time on {{ $labels.service }}" - description: "Service {{ $labels.service }} P95 latency is above 1 second (current: {{ $value }}s)." 
- runbook_url: "https://runbooks.bakery-ia.local/HighResponseTime" - - - alert: HighMemoryUsage - expr: | - container_memory_usage_bytes{namespace="bakery-ia", container!=""} > 500000000 - for: 5m - labels: - severity: warning - component: infrastructure - annotations: - summary: "High memory usage in {{ $labels.pod }}" - description: "Container {{ $labels.container }} in pod {{ $labels.pod }} is using more than 500MB of memory (current: {{ $value | humanize }}B)." - runbook_url: "https://runbooks.bakery-ia.local/HighMemoryUsage" - - - alert: DatabaseConnectionHigh - expr: | - pg_stat_database_numbackends{datname="bakery"} > 80 - for: 5m - labels: - severity: warning - component: database - annotations: - summary: "High database connection count" - description: "Database has more than 80 active connections (current: {{ $value }})." - runbook_url: "https://runbooks.bakery-ia.local/DatabaseConnectionHigh" - - # Business Logic Alerts - - name: bakery_business - interval: 30s - rules: - - alert: TrainingJobFailed - expr: | - increase(training_job_failures_total[1h]) > 0 - for: 5m - labels: - severity: warning - component: ml-training - annotations: - summary: "Training job failures detected" - description: "{{ $value }} training job(s) failed in the last hour." - runbook_url: "https://runbooks.bakery-ia.local/TrainingJobFailed" - - - alert: LowPredictionAccuracy - expr: | - prediction_model_accuracy < 0.70 - for: 15m - labels: - severity: warning - component: ml-inference - annotations: - summary: "Model prediction accuracy is low" - description: "Model {{ $labels.model_name }} accuracy is below 70% (current: {{ $value | humanizePercentage }})." - runbook_url: "https://runbooks.bakery-ia.local/LowPredictionAccuracy" - - - alert: APIRateLimitHit - expr: | - increase(rate_limit_hits_total[5m]) > 10 - for: 5m - labels: - severity: info - component: api-gateway - annotations: - summary: "API rate limits being hit frequently" - description: "Rate limits hit {{ $value }} times in the last 5 minutes." - runbook_url: "https://runbooks.bakery-ia.local/APIRateLimitHit" - - # Alert System Health - - name: alert_system_health - interval: 30s - rules: - - alert: AlertSystemComponentDown - expr: | - alert_system_component_health{component=~"processor|notifier|scheduler"} == 0 - for: 2m - labels: - severity: critical - component: alert-system - annotations: - summary: "Alert system component {{ $labels.component }} is unhealthy" - description: "Component {{ $labels.component }} has been unhealthy for more than 2 minutes." - runbook_url: "https://runbooks.bakery-ia.local/AlertSystemComponentDown" - - - alert: RabbitMQConnectionDown - expr: | - rabbitmq_up == 0 - for: 1m - labels: - severity: critical - component: alert-system - annotations: - summary: "RabbitMQ connection is down" - description: "Alert system has lost connection to RabbitMQ message queue." - runbook_url: "https://runbooks.bakery-ia.local/RabbitMQConnectionDown" - - - alert: RedisConnectionDown - expr: | - redis_up == 0 - for: 1m - labels: - severity: critical - component: alert-system - annotations: - summary: "Redis connection is down" - description: "Alert system has lost connection to Redis cache." 
- runbook_url: "https://runbooks.bakery-ia.local/RedisConnectionDown" - - - alert: NoSchedulerLeader - expr: | - sum(alert_system_scheduler_leader) == 0 - for: 5m - labels: - severity: warning - component: alert-system - annotations: - summary: "No alert scheduler leader elected" - description: "No scheduler instance has been elected as leader for 5 minutes." - runbook_url: "https://runbooks.bakery-ia.local/NoSchedulerLeader" - - # Alert System Performance - - name: alert_system_performance - interval: 30s - rules: - - alert: HighAlertProcessingErrorRate - expr: | - ( - sum(rate(alert_processing_errors_total[2m])) - / - sum(rate(alerts_processed_total[2m])) - ) > 0.10 - for: 2m - labels: - severity: critical - component: alert-system - annotations: - summary: "High alert processing error rate" - description: "Alert processing error rate is above 10% (current: {{ $value | humanizePercentage }})." - runbook_url: "https://runbooks.bakery-ia.local/HighAlertProcessingErrorRate" - - - alert: HighNotificationDeliveryFailureRate - expr: | - ( - sum(rate(notification_delivery_failures_total[3m])) - / - sum(rate(notifications_sent_total[3m])) - ) > 0.05 - for: 3m - labels: - severity: warning - component: alert-system - annotations: - summary: "High notification delivery failure rate" - description: "Notification delivery failure rate is above 5% (current: {{ $value | humanizePercentage }})." - runbook_url: "https://runbooks.bakery-ia.local/HighNotificationDeliveryFailureRate" - - - alert: HighAlertProcessingLatency - expr: | - histogram_quantile(0.95, - sum(rate(alert_processing_duration_seconds_bucket[5m])) by (le) - ) > 5 - for: 5m - labels: - severity: warning - component: alert-system - annotations: - summary: "High alert processing latency" - description: "P95 alert processing latency is above 5 seconds (current: {{ $value }}s)." - runbook_url: "https://runbooks.bakery-ia.local/HighAlertProcessingLatency" - - - alert: TooManySSEConnections - expr: | - sse_active_connections > 1000 - for: 2m - labels: - severity: warning - component: alert-system - annotations: - summary: "Too many active SSE connections" - description: "More than 1000 active SSE connections (current: {{ $value }})." - runbook_url: "https://runbooks.bakery-ia.local/TooManySSEConnections" - - - alert: SSEConnectionErrors - expr: | - rate(sse_connection_errors_total[3m]) > 0.5 - for: 3m - labels: - severity: warning - component: alert-system - annotations: - summary: "High rate of SSE connection errors" - description: "SSE connection error rate is {{ $value }} errors/sec." - runbook_url: "https://runbooks.bakery-ia.local/SSEConnectionErrors" - - # Alert System Business Logic - - name: alert_system_business - interval: 30s - rules: - - alert: UnusuallyHighAlertVolume - expr: | - rate(alerts_generated_total[5m]) > 2 - for: 5m - labels: - severity: warning - component: alert-system - annotations: - summary: "Unusually high alert generation volume" - description: "More than 2 alerts per second being generated (current: {{ $value }}/sec)." - runbook_url: "https://runbooks.bakery-ia.local/UnusuallyHighAlertVolume" - - - alert: NoAlertsGenerated - expr: | - rate(alerts_generated_total[30m]) == 0 - for: 15m - labels: - severity: info - component: alert-system - annotations: - summary: "No alerts generated recently" - description: "No alerts have been generated in the last 30 minutes. This might indicate a problem with alert detection." 
- runbook_url: "https://runbooks.bakery-ia.local/NoAlertsGenerated" - - - alert: SlowAlertResponseTime - expr: | - histogram_quantile(0.95, - sum(rate(alert_response_time_seconds_bucket[10m])) by (le) - ) > 3600 - for: 10m - labels: - severity: warning - component: alert-system - annotations: - summary: "Slow alert response times" - description: "P95 alert response time is above 1 hour (current: {{ $value | humanizeDuration }})." - runbook_url: "https://runbooks.bakery-ia.local/SlowAlertResponseTime" - - - alert: CriticalAlertsUnacknowledged - expr: | - sum(alerts_unacknowledged{severity="critical"}) > 5 - for: 10m - labels: - severity: warning - component: alert-system - annotations: - summary: "Multiple critical alerts unacknowledged" - description: "{{ $value }} critical alerts have not been acknowledged for 10+ minutes." - runbook_url: "https://runbooks.bakery-ia.local/CriticalAlertsUnacknowledged" - - # Alert System Capacity - - name: alert_system_capacity - interval: 30s - rules: - - alert: LargeSSEMessageQueues - expr: | - sse_message_queue_size > 100 - for: 5m - labels: - severity: warning - component: alert-system - annotations: - summary: "Large SSE message queues detected" - description: "SSE message queue for tenant {{ $labels.tenant_id }} has {{ $value }} messages queued." - runbook_url: "https://runbooks.bakery-ia.local/LargeSSEMessageQueues" - - - alert: SlowDatabaseStorage - expr: | - histogram_quantile(0.95, - sum(rate(alert_storage_duration_seconds_bucket[5m])) by (le) - ) > 1 - for: 5m - labels: - severity: warning - component: alert-system - annotations: - summary: "Slow alert database storage" - description: "P95 alert storage latency is above 1 second (current: {{ $value }}s)." - runbook_url: "https://runbooks.bakery-ia.local/SlowDatabaseStorage" - - # Alert System Critical Scenarios - - name: alert_system_critical - interval: 15s - rules: - - alert: AlertSystemDown - expr: | - up{service=~"alert-processor|notification-service"} == 0 - for: 1m - labels: - severity: critical - component: alert-system - annotations: - summary: "Alert system is completely down" - description: "Core alert system service {{ $labels.service }} is down." - runbook_url: "https://runbooks.bakery-ia.local/AlertSystemDown" - - - alert: AlertDataNotPersisted - expr: | - ( - sum(rate(alerts_processed_total[2m])) - - - sum(rate(alerts_stored_total[2m])) - ) > 0 - for: 2m - labels: - severity: critical - component: alert-system - annotations: - summary: "Alerts not being persisted to database" - description: "Alerts are being processed but not stored in the database." - runbook_url: "https://runbooks.bakery-ia.local/AlertDataNotPersisted" - - - alert: NotificationsNotDelivered - expr: | - ( - sum(rate(alerts_processed_total[3m])) - - - sum(rate(notifications_sent_total[3m])) - ) > 0 - for: 3m - labels: - severity: critical - component: alert-system - annotations: - summary: "Notifications not being delivered" - description: "Alerts are being processed but notifications are not being sent." - runbook_url: "https://runbooks.bakery-ia.local/NotificationsNotDelivered" - - # Monitoring System Self-Monitoring - - name: monitoring_health - interval: 30s - rules: - - alert: PrometheusDown - expr: up{job="prometheus"} == 0 - for: 5m - labels: - severity: critical - component: monitoring - annotations: - summary: "Prometheus is down" - description: "Prometheus monitoring system is not responding." 
- runbook_url: "https://runbooks.bakery-ia.local/PrometheusDown" - - - alert: AlertManagerDown - expr: up{job="alertmanager"} == 0 - for: 2m - labels: - severity: critical - component: monitoring - annotations: - summary: "AlertManager is down" - description: "AlertManager is not responding. Alerts will not be routed." - runbook_url: "https://runbooks.bakery-ia.local/AlertManagerDown" - - - alert: PrometheusStorageFull - expr: | - ( - prometheus_tsdb_storage_blocks_bytes - / - (prometheus_tsdb_storage_blocks_bytes + prometheus_tsdb_wal_size_bytes) - ) > 0.90 - for: 10m - labels: - severity: warning - component: monitoring - annotations: - summary: "Prometheus storage almost full" - description: "Prometheus storage is {{ $value | humanizePercentage }} full." - runbook_url: "https://runbooks.bakery-ia.local/PrometheusStorageFull" - - - alert: PrometheusScrapeErrors - expr: | - rate(prometheus_target_scrapes_exceeded_sample_limit_total[5m]) > 0 - for: 5m - labels: - severity: warning - component: monitoring - annotations: - summary: "Prometheus scrape errors detected" - description: "Prometheus is experiencing scrape errors for target {{ $labels.job }}." - runbook_url: "https://runbooks.bakery-ia.local/PrometheusScrapeErrors" diff --git a/infrastructure/kubernetes/base/components/monitoring/alertmanager-init.yaml b/infrastructure/kubernetes/base/components/monitoring/alertmanager-init.yaml deleted file mode 100644 index bddd8b30..00000000 --- a/infrastructure/kubernetes/base/components/monitoring/alertmanager-init.yaml +++ /dev/null @@ -1,27 +0,0 @@ ---- -# InitContainer to substitute secrets into AlertManager config -# This allows us to use environment variables from secrets in the config file -apiVersion: v1 -kind: ConfigMap -metadata: - name: alertmanager-init-script - namespace: monitoring -data: - init-config.sh: | - #!/bin/sh - set -e - - # Read the template config - TEMPLATE=$(cat /etc/alertmanager-template/alertmanager.yml) - - # Substitute environment variables - echo "$TEMPLATE" | \ - sed "s|{{ .smtp_host }}|${SMTP_HOST}|g" | \ - sed "s|{{ .smtp_from }}|${SMTP_FROM}|g" | \ - sed "s|{{ .smtp_username }}|${SMTP_USERNAME}|g" | \ - sed "s|{{ .smtp_password }}|${SMTP_PASSWORD}|g" | \ - sed "s|{{ .slack_webhook_url }}|${SLACK_WEBHOOK_URL}|g" \ - > /etc/alertmanager-final/alertmanager.yml - - echo "AlertManager config initialized successfully" - cat /etc/alertmanager-final/alertmanager.yml diff --git a/infrastructure/kubernetes/base/components/monitoring/alertmanager.yaml b/infrastructure/kubernetes/base/components/monitoring/alertmanager.yaml deleted file mode 100644 index e2f7f9a2..00000000 --- a/infrastructure/kubernetes/base/components/monitoring/alertmanager.yaml +++ /dev/null @@ -1,391 +0,0 @@ ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: alertmanager-config - namespace: monitoring -data: - alertmanager.yml: | - global: - resolve_timeout: 5m - smtp_smarthost: '{{ .smtp_host }}' - smtp_from: '{{ .smtp_from }}' - smtp_auth_username: '{{ .smtp_username }}' - smtp_auth_password: '{{ .smtp_password }}' - smtp_require_tls: true - - # Define notification templates - templates: - - '/etc/alertmanager/templates/*.tmpl' - - # Route alerts to appropriate receivers - route: - # Default receiver - receiver: 'default-email' - # Group alerts by these labels - group_by: ['alertname', 'cluster', 'service'] - # Wait time before sending initial notification - group_wait: 10s - # Wait time before sending notifications about new alerts in the group - group_interval: 10s - # Wait time before 
re-sending a notification - repeat_interval: 12h - - # Child routes for specific alert routing - routes: - # Critical alerts - send immediately to all channels - - match: - severity: critical - receiver: 'critical-alerts' - group_wait: 0s - group_interval: 5m - repeat_interval: 4h - continue: true - - # Warning alerts - less urgent - - match: - severity: warning - receiver: 'warning-alerts' - group_wait: 30s - group_interval: 5m - repeat_interval: 12h - - # Alert system specific alerts - - match: - component: alert-system - receiver: 'alert-system-team' - group_wait: 10s - repeat_interval: 6h - - # Database alerts - - match_re: - alertname: ^(DatabaseConnectionHigh|SlowDatabaseStorage)$ - receiver: 'database-team' - group_wait: 30s - repeat_interval: 8h - - # Infrastructure alerts - - match_re: - alertname: ^(HighMemoryUsage|ServiceDown)$ - receiver: 'infra-team' - group_wait: 30s - repeat_interval: 6h - - # Inhibition rules - prevent alert spam - inhibit_rules: - # If service is down, inhibit all other alerts for that service - - source_match: - alertname: 'ServiceDown' - target_match_re: - alertname: '(HighErrorRate|HighResponseTime|HighMemoryUsage)' - equal: ['service'] - - # If AlertSystem is completely down, inhibit component alerts - - source_match: - alertname: 'AlertSystemDown' - target_match_re: - alertname: 'AlertSystemComponent.*' - equal: ['namespace'] - - # If RabbitMQ is down, inhibit alert processing errors - - source_match: - alertname: 'RabbitMQConnectionDown' - target_match: - alertname: 'HighAlertProcessingErrorRate' - equal: ['namespace'] - - # Receivers - notification destinations - receivers: - # Default email receiver - - name: 'default-email' - email_configs: - - to: 'alerts@yourdomain.com' - headers: - Subject: '[{{ .Status | toUpper }}] {{ .GroupLabels.alertname }} - {{ .GroupLabels.service }}' - html: | - {{ range .Alerts }} -

{{ .Labels.alertname }}
- Status: {{ .Status }}
- Severity: {{ .Labels.severity }}
- Service: {{ .Labels.service }}
- Summary: {{ .Annotations.summary }}
- Description: {{ .Annotations.description }}
- Started: {{ .StartsAt }}
- {{ if .EndsAt }}Ended: {{ .EndsAt }}
{{ end }} - {{ end }} - - # Critical alerts - multiple channels - - name: 'critical-alerts' - email_configs: - - to: 'critical-alerts@yourdomain.com,oncall@yourdomain.com' - headers: - Subject: '๐Ÿšจ [CRITICAL] {{ .GroupLabels.alertname }} - {{ .GroupLabels.service }}' - send_resolved: true - # Uncomment to enable Slack notifications - # slack_configs: - # - api_url: '{{ .slack_webhook_url }}' - # channel: '#alerts-critical' - # title: '๐Ÿšจ Critical Alert' - # text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}' - # send_resolved: true - - # Warning alerts - - name: 'warning-alerts' - email_configs: - - to: 'alerts@yourdomain.com' - headers: - Subject: 'โš ๏ธ [WARNING] {{ .GroupLabels.alertname }} - {{ .GroupLabels.service }}' - send_resolved: true - - # Alert system team - - name: 'alert-system-team' - email_configs: - - to: 'alert-system-team@yourdomain.com' - headers: - Subject: '[Alert System] {{ .GroupLabels.alertname }}' - send_resolved: true - - # Database team - - name: 'database-team' - email_configs: - - to: 'database-team@yourdomain.com' - headers: - Subject: '[Database] {{ .GroupLabels.alertname }}' - send_resolved: true - - # Infrastructure team - - name: 'infra-team' - email_configs: - - to: 'infra-team@yourdomain.com' - headers: - Subject: '[Infrastructure] {{ .GroupLabels.alertname }}' - send_resolved: true - ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: alertmanager-templates - namespace: monitoring -data: - default.tmpl: | - {{ define "cluster" }}{{ .ExternalURL | reReplaceAll ".*alertmanager\\.(.*)" "$1" }}{{ end }} - - {{ define "slack.default.title" }} - [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.alertname }} - {{ end }} - - {{ define "slack.default.text" }} - {{ range .Alerts }} - *Alert:* {{ .Annotations.summary }} - *Description:* {{ .Annotations.description }} - *Severity:* `{{ .Labels.severity }}` - *Service:* `{{ .Labels.service }}` - {{ end }} - {{ end }} - ---- -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: alertmanager - namespace: monitoring - labels: - app: alertmanager -spec: - serviceName: alertmanager - replicas: 3 - selector: - matchLabels: - app: alertmanager - template: - metadata: - labels: - app: alertmanager - spec: - serviceAccountName: prometheus - initContainers: - - name: init-config - image: busybox:1.36 - command: ['/bin/sh', '/scripts/init-config.sh'] - env: - - name: SMTP_HOST - valueFrom: - secretKeyRef: - name: alertmanager-secrets - key: smtp-host - - name: SMTP_USERNAME - valueFrom: - secretKeyRef: - name: alertmanager-secrets - key: smtp-username - - name: SMTP_PASSWORD - valueFrom: - secretKeyRef: - name: alertmanager-secrets - key: smtp-password - - name: SMTP_FROM - valueFrom: - secretKeyRef: - name: alertmanager-secrets - key: smtp-from - - name: SLACK_WEBHOOK_URL - valueFrom: - secretKeyRef: - name: alertmanager-secrets - key: slack-webhook-url - optional: true - volumeMounts: - - name: init-script - mountPath: /scripts - - name: config-template - mountPath: /etc/alertmanager-template - - name: config-final - mountPath: /etc/alertmanager-final - affinity: - podAntiAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - weight: 100 - podAffinityTerm: - labelSelector: - matchExpressions: - - key: app - operator: In - values: - - alertmanager - topologyKey: kubernetes.io/hostname - containers: - - name: alertmanager - image: prom/alertmanager:v0.27.0 - args: - - '--config.file=/etc/alertmanager/alertmanager.yml' - - 
'--storage.path=/alertmanager' - - '--cluster.listen-address=0.0.0.0:9094' - - '--cluster.peer=alertmanager-0.alertmanager.monitoring.svc.cluster.local:9094' - - '--cluster.peer=alertmanager-1.alertmanager.monitoring.svc.cluster.local:9094' - - '--cluster.peer=alertmanager-2.alertmanager.monitoring.svc.cluster.local:9094' - - '--cluster.reconnect-timeout=5m' - - '--web.external-url=http://monitoring.bakery-ia.local/alertmanager' - - '--web.route-prefix=/' - ports: - - name: web - containerPort: 9093 - - name: mesh-tcp - containerPort: 9094 - - name: mesh-udp - containerPort: 9094 - protocol: UDP - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - volumeMounts: - - name: config-final - mountPath: /etc/alertmanager - - name: templates - mountPath: /etc/alertmanager/templates - - name: storage - mountPath: /alertmanager - resources: - requests: - memory: "128Mi" - cpu: "100m" - limits: - memory: "256Mi" - cpu: "500m" - livenessProbe: - httpGet: - path: /-/healthy - port: 9093 - initialDelaySeconds: 30 - periodSeconds: 10 - readinessProbe: - httpGet: - path: /-/ready - port: 9093 - initialDelaySeconds: 5 - periodSeconds: 5 - - # Config reloader sidecar - - name: configmap-reload - image: jimmidyson/configmap-reload:v0.12.0 - args: - - '--webhook-url=http://localhost:9093/-/reload' - - '--volume-dir=/etc/alertmanager' - volumeMounts: - - name: config-final - mountPath: /etc/alertmanager - readOnly: true - resources: - requests: - memory: "16Mi" - cpu: "10m" - limits: - memory: "32Mi" - cpu: "50m" - - volumes: - - name: init-script - configMap: - name: alertmanager-init-script - defaultMode: 0755 - - name: config-template - configMap: - name: alertmanager-config - - name: config-final - emptyDir: {} - - name: templates - configMap: - name: alertmanager-templates - - volumeClaimTemplates: - - metadata: - name: storage - spec: - accessModes: [ "ReadWriteOnce" ] - resources: - requests: - storage: 2Gi - ---- -apiVersion: v1 -kind: Service -metadata: - name: alertmanager - namespace: monitoring - labels: - app: alertmanager -spec: - type: ClusterIP - clusterIP: None - ports: - - name: web - port: 9093 - targetPort: 9093 - - name: mesh-tcp - port: 9094 - targetPort: 9094 - - name: mesh-udp - port: 9094 - targetPort: 9094 - protocol: UDP - selector: - app: alertmanager - ---- -apiVersion: v1 -kind: Service -metadata: - name: alertmanager-external - namespace: monitoring - labels: - app: alertmanager -spec: - type: ClusterIP - ports: - - name: web - port: 9093 - targetPort: 9093 - selector: - app: alertmanager diff --git a/infrastructure/kubernetes/base/components/monitoring/grafana-dashboards-extended.yaml b/infrastructure/kubernetes/base/components/monitoring/grafana-dashboards-extended.yaml deleted file mode 100644 index 84495bfc..00000000 --- a/infrastructure/kubernetes/base/components/monitoring/grafana-dashboards-extended.yaml +++ /dev/null @@ -1,949 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: grafana-dashboards-extended - namespace: monitoring -data: - postgresql-dashboard.json: | - { - "dashboard": { - "title": "Bakery IA - PostgreSQL Database", - "tags": ["bakery-ia", "postgresql", "database"], - "timezone": "browser", - "refresh": "30s", - "schemaVersion": 16, - "version": 1, - "panels": [ - { - "id": 1, - "title": "Active Connections by Database", - "type": "graph", - "gridPos": {"x": 0, "y": 0, "w": 12, "h": 8}, - "targets": [ - { - "expr": "pg_stat_activity_count{state=\"active\"}", - "legendFormat": "{{datname}} - active" - }, - { - "expr": 
"pg_stat_activity_count{state=\"idle\"}", - "legendFormat": "{{datname}} - idle" - }, - { - "expr": "pg_stat_activity_count{state=\"idle in transaction\"}", - "legendFormat": "{{datname}} - idle tx" - } - ] - }, - { - "id": 2, - "title": "Total Connections", - "type": "stat", - "gridPos": {"x": 12, "y": 0, "w": 6, "h": 4}, - "targets": [ - { - "expr": "sum(pg_stat_activity_count)", - "legendFormat": "Total connections" - } - ] - }, - { - "id": 3, - "title": "Max Connections", - "type": "stat", - "gridPos": {"x": 18, "y": 0, "w": 6, "h": 4}, - "targets": [ - { - "expr": "pg_settings_max_connections", - "legendFormat": "Max connections" - } - ] - }, - { - "id": 4, - "title": "Transaction Rate (Commits vs Rollbacks)", - "type": "graph", - "gridPos": {"x": 0, "y": 8, "w": 12, "h": 8}, - "targets": [ - { - "expr": "rate(pg_stat_database_xact_commit[5m])", - "legendFormat": "{{datname}} - commits" - }, - { - "expr": "rate(pg_stat_database_xact_rollback[5m])", - "legendFormat": "{{datname}} - rollbacks" - } - ] - }, - { - "id": 5, - "title": "Cache Hit Ratio", - "type": "graph", - "gridPos": {"x": 12, "y": 8, "w": 12, "h": 8}, - "targets": [ - { - "expr": "100 * (1 - (sum(rate(pg_stat_io_blocks_read_total[5m])) / (sum(rate(pg_stat_io_blocks_read_total[5m])) + sum(rate(pg_stat_io_blocks_hit_total[5m])))))", - "legendFormat": "Cache hit ratio %" - } - ] - }, - { - "id": 6, - "title": "Slow Queries (> 30s)", - "type": "table", - "gridPos": {"x": 0, "y": 16, "w": 12, "h": 8}, - "targets": [ - { - "expr": "pg_slow_queries{duration_ms > 30000}", - "format": "table", - "instant": true - } - ], - "transformations": [ - { - "id": "organize", - "options": { - "excludeByName": {}, - "indexByName": {}, - "renameByName": { - "query": "Query", - "duration_ms": "Duration (ms)", - "datname": "Database" - } - } - } - ] - }, - { - "id": 7, - "title": "Dead Tuples by Table", - "type": "graph", - "gridPos": {"x": 12, "y": 16, "w": 12, "h": 8}, - "targets": [ - { - "expr": "pg_stat_user_tables_n_dead_tup", - "legendFormat": "{{schemaname}}.{{relname}}" - } - ] - }, - { - "id": 8, - "title": "Table Bloat Estimate", - "type": "graph", - "gridPos": {"x": 0, "y": 24, "w": 12, "h": 8}, - "targets": [ - { - "expr": "100 * (pg_stat_user_tables_n_dead_tup * avg_tuple_size) / (pg_total_relation_size * 8192)", - "legendFormat": "{{schemaname}}.{{relname}} bloat %" - } - ] - }, - { - "id": 9, - "title": "Replication Lag (bytes)", - "type": "graph", - "gridPos": {"x": 12, "y": 24, "w": 12, "h": 8}, - "targets": [ - { - "expr": "pg_replication_lag_bytes", - "legendFormat": "{{slot_name}} - {{application_name}}" - } - ] - }, - { - "id": 10, - "title": "Database Size (GB)", - "type": "graph", - "gridPos": {"x": 0, "y": 32, "w": 12, "h": 8}, - "targets": [ - { - "expr": "pg_database_size_bytes / 1024 / 1024 / 1024", - "legendFormat": "{{datname}}" - } - ] - }, - { - "id": 11, - "title": "Database Size Growth (per hour)", - "type": "graph", - "gridPos": {"x": 12, "y": 32, "w": 12, "h": 8}, - "targets": [ - { - "expr": "rate(pg_database_size_bytes[1h])", - "legendFormat": "{{datname}} - bytes/hour" - } - ] - }, - { - "id": 12, - "title": "Lock Counts by Type", - "type": "graph", - "gridPos": {"x": 0, "y": 40, "w": 12, "h": 8}, - "targets": [ - { - "expr": "pg_locks_count", - "legendFormat": "{{datname}} - {{locktype}} - {{mode}}" - } - ] - }, - { - "id": 13, - "title": "Query Duration (p95)", - "type": "graph", - "gridPos": {"x": 12, "y": 40, "w": 12, "h": 8}, - "targets": [ - { - "expr": "histogram_quantile(0.95, 
rate(pg_query_duration_seconds_bucket[5m]))", - "legendFormat": "p95" - } - ] - } - ] - } - } - - node-exporter-dashboard.json: | - { - "dashboard": { - "title": "Bakery IA - Node Exporter Infrastructure", - "tags": ["bakery-ia", "node-exporter", "infrastructure"], - "timezone": "browser", - "refresh": "15s", - "schemaVersion": 16, - "version": 1, - "panels": [ - { - "id": 1, - "title": "CPU Usage by Node", - "type": "graph", - "gridPos": {"x": 0, "y": 0, "w": 12, "h": 8}, - "targets": [ - { - "expr": "100 - (avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)", - "legendFormat": "{{instance}} - {{cpu}}" - } - ] - }, - { - "id": 2, - "title": "Average CPU Usage", - "type": "stat", - "gridPos": {"x": 12, "y": 0, "w": 6, "h": 4}, - "targets": [ - { - "expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)", - "legendFormat": "Average CPU %" - } - ] - }, - { - "id": 3, - "title": "CPU Load (1m, 5m, 15m)", - "type": "stat", - "gridPos": {"x": 18, "y": 0, "w": 6, "h": 4}, - "targets": [ - { - "expr": "avg(node_load1)", - "legendFormat": "1m" - }, - { - "expr": "avg(node_load5)", - "legendFormat": "5m" - }, - { - "expr": "avg(node_load15)", - "legendFormat": "15m" - } - ] - }, - { - "id": 4, - "title": "Memory Usage by Node", - "type": "graph", - "gridPos": {"x": 0, "y": 8, "w": 12, "h": 8}, - "targets": [ - { - "expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))", - "legendFormat": "{{instance}}" - } - ] - }, - { - "id": 5, - "title": "Memory Used (GB)", - "type": "stat", - "gridPos": {"x": 12, "y": 8, "w": 6, "h": 4}, - "targets": [ - { - "expr": "(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / 1024 / 1024 / 1024", - "legendFormat": "{{instance}}" - } - ] - }, - { - "id": 6, - "title": "Memory Available (GB)", - "type": "stat", - "gridPos": {"x": 18, "y": 8, "w": 6, "h": 4}, - "targets": [ - { - "expr": "node_memory_MemAvailable_bytes / 1024 / 1024 / 1024", - "legendFormat": "{{instance}}" - } - ] - }, - { - "id": 7, - "title": "Disk I/O Read Rate (MB/s)", - "type": "graph", - "gridPos": {"x": 0, "y": 16, "w": 12, "h": 8}, - "targets": [ - { - "expr": "rate(node_disk_read_bytes_total[5m]) / 1024 / 1024", - "legendFormat": "{{instance}} - {{device}}" - } - ] - }, - { - "id": 8, - "title": "Disk I/O Write Rate (MB/s)", - "type": "graph", - "gridPos": {"x": 12, "y": 16, "w": 12, "h": 8}, - "targets": [ - { - "expr": "rate(node_disk_written_bytes_total[5m]) / 1024 / 1024", - "legendFormat": "{{instance}} - {{device}}" - } - ] - }, - { - "id": 9, - "title": "Disk I/O Operations (IOPS)", - "type": "graph", - "gridPos": {"x": 0, "y": 24, "w": 12, "h": 8}, - "targets": [ - { - "expr": "rate(node_disk_reads_completed_total[5m]) + rate(node_disk_writes_completed_total[5m])", - "legendFormat": "{{instance}} - {{device}}" - } - ] - }, - { - "id": 10, - "title": "Network Receive Rate (Mbps)", - "type": "graph", - "gridPos": {"x": 12, "y": 24, "w": 12, "h": 8}, - "targets": [ - { - "expr": "rate(node_network_receive_bytes_total{device!=\"lo\"}[5m]) * 8 / 1024 / 1024", - "legendFormat": "{{instance}} - {{device}}" - } - ] - }, - { - "id": 11, - "title": "Network Transmit Rate (Mbps)", - "type": "graph", - "gridPos": {"x": 0, "y": 32, "w": 12, "h": 8}, - "targets": [ - { - "expr": "rate(node_network_transmit_bytes_total{device!=\"lo\"}[5m]) * 8 / 1024 / 1024", - "legendFormat": "{{instance}} - {{device}}" - } - ] - }, - { - "id": 12, - "title": "Network Errors", - "type": "graph", - "gridPos": {"x": 12, "y": 32, "w": 12, 
"h": 8}, - "targets": [ - { - "expr": "rate(node_network_receive_errs_total[5m]) + rate(node_network_transmit_errs_total[5m])", - "legendFormat": "{{instance}} - {{device}}" - } - ] - }, - { - "id": 13, - "title": "Filesystem Usage by Mount", - "type": "graph", - "gridPos": {"x": 0, "y": 40, "w": 12, "h": 8}, - "targets": [ - { - "expr": "100 * (1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes))", - "legendFormat": "{{instance}} - {{mountpoint}}" - } - ] - }, - { - "id": 14, - "title": "Filesystem Available (GB)", - "type": "stat", - "gridPos": {"x": 12, "y": 40, "w": 6, "h": 4}, - "targets": [ - { - "expr": "node_filesystem_avail_bytes / 1024 / 1024 / 1024", - "legendFormat": "{{instance}} - {{mountpoint}}" - } - ] - }, - { - "id": 15, - "title": "Filesystem Size (GB)", - "type": "stat", - "gridPos": {"x": 18, "y": 40, "w": 6, "h": 4}, - "targets": [ - { - "expr": "node_filesystem_size_bytes / 1024 / 1024 / 1024", - "legendFormat": "{{instance}} - {{mountpoint}}" - } - ] - }, - { - "id": 16, - "title": "Load Average (1m, 5m, 15m)", - "type": "graph", - "gridPos": {"x": 0, "y": 48, "w": 12, "h": 8}, - "targets": [ - { - "expr": "node_load1", - "legendFormat": "{{instance}} - 1m" - }, - { - "expr": "node_load5", - "legendFormat": "{{instance}} - 5m" - }, - { - "expr": "node_load15", - "legendFormat": "{{instance}} - 15m" - } - ] - }, - { - "id": 17, - "title": "System Up Time", - "type": "stat", - "gridPos": {"x": 12, "y": 48, "w": 12, "h": 8}, - "targets": [ - { - "expr": "node_boot_time_seconds", - "legendFormat": "{{instance}} - uptime" - } - ] - }, - { - "id": 18, - "title": "Context Switches", - "type": "graph", - "gridPos": {"x": 0, "y": 56, "w": 12, "h": 8}, - "targets": [ - { - "expr": "rate(node_context_switches_total[5m])", - "legendFormat": "{{instance}}" - } - ] - }, - { - "id": 19, - "title": "Interrupts", - "type": "graph", - "gridPos": {"x": 12, "y": 56, "w": 12, "h": 8}, - "targets": [ - { - "expr": "rate(node_intr_total[5m])", - "legendFormat": "{{instance}}" - } - ] - } - ] - } - } - - alertmanager-dashboard.json: | - { - "dashboard": { - "title": "Bakery IA - AlertManager Monitoring", - "tags": ["bakery-ia", "alertmanager", "alerting"], - "timezone": "browser", - "refresh": "10s", - "schemaVersion": 16, - "version": 1, - "panels": [ - { - "id": 1, - "title": "Active Alerts by Severity", - "type": "graph", - "gridPos": {"x": 0, "y": 0, "w": 12, "h": 8}, - "targets": [ - { - "expr": "count by (severity) (ALERTS{alertstate=\"firing\"})", - "legendFormat": "{{severity}}" - } - ] - }, - { - "id": 2, - "title": "Total Active Alerts", - "type": "stat", - "gridPos": {"x": 12, "y": 0, "w": 6, "h": 4}, - "targets": [ - { - "expr": "count(ALERTS{alertstate=\"firing\"})", - "legendFormat": "Active alerts" - } - ] - }, - { - "id": 3, - "title": "Critical Alerts", - "type": "stat", - "gridPos": {"x": 18, "y": 0, "w": 6, "h": 4}, - "targets": [ - { - "expr": "count(ALERTS{alertstate=\"firing\", severity=\"critical\"})", - "legendFormat": "Critical" - } - ] - }, - { - "id": 4, - "title": "Alert Firing Rate (per minute)", - "type": "graph", - "gridPos": {"x": 0, "y": 8, "w": 12, "h": 8}, - "targets": [ - { - "expr": "rate(alertmanager_alerts_fired_total[1m])", - "legendFormat": "Alerts fired/min" - } - ] - }, - { - "id": 5, - "title": "Alert Resolution Rate (per minute)", - "type": "graph", - "gridPos": {"x": 12, "y": 8, "w": 12, "h": 8}, - "targets": [ - { - "expr": "rate(alertmanager_alerts_resolved_total[1m])", - "legendFormat": "Alerts resolved/min" - } - ] - }, - { - 
"id": 6, - "title": "Notification Success Rate", - "type": "graph", - "gridPos": {"x": 0, "y": 16, "w": 12, "h": 8}, - "targets": [ - { - "expr": "100 * (rate(alertmanager_notifications_total{status=\"success\"}[5m]) / rate(alertmanager_notifications_total[5m]))", - "legendFormat": "Success rate %" - } - ] - }, - { - "id": 7, - "title": "Notification Failures", - "type": "graph", - "gridPos": {"x": 12, "y": 16, "w": 12, "h": 8}, - "targets": [ - { - "expr": "rate(alertmanager_notifications_total{status=\"failed\"}[5m])", - "legendFormat": "{{integration}}" - } - ] - }, - { - "id": 8, - "title": "Silenced Alerts", - "type": "stat", - "gridPos": {"x": 0, "y": 24, "w": 6, "h": 4}, - "targets": [ - { - "expr": "count(ALERTS{alertstate=\"silenced\"})", - "legendFormat": "Silenced" - } - ] - }, - { - "id": 9, - "title": "AlertManager Cluster Size", - "type": "stat", - "gridPos": {"x": 6, "y": 24, "w": 6, "h": 4}, - "targets": [ - { - "expr": "count(alertmanager_cluster_peers)", - "legendFormat": "Cluster peers" - } - ] - }, - { - "id": 10, - "title": "AlertManager Peers", - "type": "stat", - "gridPos": {"x": 12, "y": 24, "w": 6, "h": 4}, - "targets": [ - { - "expr": "alertmanager_cluster_peers", - "legendFormat": "{{instance}}" - } - ] - }, - { - "id": 11, - "title": "Cluster Status", - "type": "stat", - "gridPos": {"x": 18, "y": 24, "w": 6, "h": 4}, - "targets": [ - { - "expr": "up{job=\"alertmanager\"}", - "legendFormat": "{{instance}}" - } - ] - }, - { - "id": 12, - "title": "Alerts by Group", - "type": "table", - "gridPos": {"x": 0, "y": 28, "w": 12, "h": 8}, - "targets": [ - { - "expr": "count by (alertname) (ALERTS{alertstate=\"firing\"})", - "format": "table", - "instant": true - } - ], - "transformations": [ - { - "id": "organize", - "options": { - "excludeByName": {}, - "indexByName": {}, - "renameByName": { - "alertname": "Alert Name", - "Value": "Count" - } - } - } - ] - }, - { - "id": 13, - "title": "Alert Duration (p99)", - "type": "graph", - "gridPos": {"x": 12, "y": 28, "w": 12, "h": 8}, - "targets": [ - { - "expr": "histogram_quantile(0.99, rate(alertmanager_alert_duration_seconds_bucket[5m]))", - "legendFormat": "p99 duration" - } - ] - }, - { - "id": 14, - "title": "Processing Time", - "type": "graph", - "gridPos": {"x": 0, "y": 36, "w": 12, "h": 8}, - "targets": [ - { - "expr": "rate(alertmanager_receiver_processing_duration_seconds_sum[5m]) / rate(alertmanager_receiver_processing_duration_seconds_count[5m])", - "legendFormat": "{{receiver}}" - } - ] - }, - { - "id": 15, - "title": "Memory Usage", - "type": "stat", - "gridPos": {"x": 12, "y": 36, "w": 12, "h": 8}, - "targets": [ - { - "expr": "process_resident_memory_bytes{job=\"alertmanager\"} / 1024 / 1024", - "legendFormat": "{{instance}} - MB" - } - ] - } - ] - } - } - - business-metrics-dashboard.json: | - { - "dashboard": { - "title": "Bakery IA - Business Metrics & KPIs", - "tags": ["bakery-ia", "business-metrics", "kpis"], - "timezone": "browser", - "refresh": "30s", - "schemaVersion": 16, - "version": 1, - "panels": [ - { - "id": 1, - "title": "Requests per Service (Rate)", - "type": "graph", - "gridPos": {"x": 0, "y": 0, "w": 12, "h": 8}, - "targets": [ - { - "expr": "sum by (service) (rate(http_requests_total[5m]))", - "legendFormat": "{{service}}" - } - ] - }, - { - "id": 2, - "title": "Total Request Rate", - "type": "stat", - "gridPos": {"x": 12, "y": 0, "w": 6, "h": 4}, - "targets": [ - { - "expr": "sum(rate(http_requests_total[5m]))", - "legendFormat": "requests/sec" - } - ] - }, - { - "id": 3, - "title": "Peak 
Request Rate (5m)", - "type": "stat", - "gridPos": {"x": 18, "y": 0, "w": 6, "h": 4}, - "targets": [ - { - "expr": "max(sum(rate(http_requests_total[5m])))", - "legendFormat": "Peak requests/sec" - } - ] - }, - { - "id": 4, - "title": "Error Rates by Service", - "type": "graph", - "gridPos": {"x": 0, "y": 8, "w": 12, "h": 8}, - "targets": [ - { - "expr": "sum by (service) (rate(http_requests_total{status_code=~\"5..\"}[5m]))", - "legendFormat": "{{service}}" - } - ] - }, - { - "id": 5, - "title": "Overall Error Rate", - "type": "stat", - "gridPos": {"x": 12, "y": 8, "w": 6, "h": 4}, - "targets": [ - { - "expr": "100 * (sum(rate(http_requests_total{status_code=~\"5..\"}[5m])) / sum(rate(http_requests_total[5m])))", - "legendFormat": "Error %" - } - ] - }, - { - "id": 6, - "title": "4xx Error Rate", - "type": "stat", - "gridPos": {"x": 18, "y": 8, "w": 6, "h": 4}, - "targets": [ - { - "expr": "100 * (sum(rate(http_requests_total{status_code=~\"4..\"}[5m])) / sum(rate(http_requests_total[5m])))", - "legendFormat": "4xx %" - } - ] - }, - { - "id": 7, - "title": "P95 Latency by Service (ms)", - "type": "graph", - "gridPos": {"x": 0, "y": 16, "w": 12, "h": 8}, - "targets": [ - { - "expr": "histogram_quantile(0.95, sum by (service, le) (rate(http_request_duration_seconds_bucket[5m]))) * 1000", - "legendFormat": "{{service}} p95" - } - ] - }, - { - "id": 8, - "title": "P99 Latency by Service (ms)", - "type": "graph", - "gridPos": {"x": 12, "y": 16, "w": 12, "h": 8}, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum by (service, le) (rate(http_request_duration_seconds_bucket[5m]))) * 1000", - "legendFormat": "{{service}} p99" - } - ] - }, - { - "id": 9, - "title": "Average Latency (ms)", - "type": "stat", - "gridPos": {"x": 0, "y": 24, "w": 6, "h": 4}, - "targets": [ - { - "expr": "(sum(rate(http_request_duration_seconds_sum[5m])) / sum(rate(http_request_duration_seconds_count[5m]))) * 1000", - "legendFormat": "Avg latency ms" - } - ] - }, - { - "id": 10, - "title": "Active Tenants", - "type": "stat", - "gridPos": {"x": 6, "y": 24, "w": 6, "h": 4}, - "targets": [ - { - "expr": "count(count by (tenant_id) (rate(http_requests_total[5m])))", - "legendFormat": "Active tenants" - } - ] - }, - { - "id": 11, - "title": "Requests per Tenant", - "type": "stat", - "gridPos": {"x": 12, "y": 24, "w": 12, "h": 4}, - "targets": [ - { - "expr": "sum by (tenant_id) (rate(http_requests_total[5m]))", - "legendFormat": "Tenant {{tenant_id}}" - } - ] - }, - { - "id": 12, - "title": "Alert Generation Rate (per minute)", - "type": "graph", - "gridPos": {"x": 0, "y": 32, "w": 12, "h": 8}, - "targets": [ - { - "expr": "rate(ALERTS_FOR_STATE[1m])", - "legendFormat": "{{alertname}}" - } - ] - }, - { - "id": 13, - "title": "Training Job Success Rate", - "type": "stat", - "gridPos": {"x": 12, "y": 32, "w": 12, "h": 8}, - "targets": [ - { - "expr": "100 * (sum(training_job_completed_total{status=\"success\"}) / sum(training_job_completed_total))", - "legendFormat": "Success rate %" - } - ] - }, - { - "id": 14, - "title": "Training Jobs in Progress", - "type": "stat", - "gridPos": {"x": 0, "y": 40, "w": 6, "h": 4}, - "targets": [ - { - "expr": "count(training_job_in_progress)", - "legendFormat": "Jobs running" - } - ] - }, - { - "id": 15, - "title": "Training Job Completion Time (p95, minutes)", - "type": "stat", - "gridPos": {"x": 6, "y": 40, "w": 6, "h": 4}, - "targets": [ - { - "expr": "histogram_quantile(0.95, training_job_duration_seconds) / 60", - "legendFormat": "p95 minutes" - } - ] - }, - { - "id": 16, - 
"title": "Failed Training Jobs", - "type": "stat", - "gridPos": {"x": 12, "y": 40, "w": 6, "h": 4}, - "targets": [ - { - "expr": "sum(training_job_completed_total{status=\"failed\"})", - "legendFormat": "Failed jobs" - } - ] - }, - { - "id": 17, - "title": "Total Training Jobs Completed", - "type": "stat", - "gridPos": {"x": 18, "y": 40, "w": 6, "h": 4}, - "targets": [ - { - "expr": "sum(training_job_completed_total)", - "legendFormat": "Total completed" - } - ] - }, - { - "id": 18, - "title": "API Health Status", - "type": "table", - "gridPos": {"x": 0, "y": 48, "w": 12, "h": 8}, - "targets": [ - { - "expr": "up{job=\"bakery-services\"}", - "format": "table", - "instant": true - } - ], - "transformations": [ - { - "id": "organize", - "options": { - "excludeByName": {}, - "indexByName": {}, - "renameByName": { - "service": "Service", - "Value": "Status", - "instance": "Instance" - } - } - } - ] - }, - { - "id": 19, - "title": "Service Success Rate (%)", - "type": "graph", - "gridPos": {"x": 12, "y": 48, "w": 12, "h": 8}, - "targets": [ - { - "expr": "100 * (1 - (sum by (service) (rate(http_requests_total{status_code=~\"5..\"}[5m])) / sum by (service) (rate(http_requests_total[5m]))))", - "legendFormat": "{{service}}" - } - ] - }, - { - "id": 20, - "title": "Requests Processed Today", - "type": "stat", - "gridPos": {"x": 0, "y": 56, "w": 12, "h": 4}, - "targets": [ - { - "expr": "sum(increase(http_requests_total[24h]))", - "legendFormat": "Requests (24h)" - } - ] - }, - { - "id": 21, - "title": "Distinct Users Today", - "type": "stat", - "gridPos": {"x": 12, "y": 56, "w": 12, "h": 4}, - "targets": [ - { - "expr": "count(count by (user_id) (increase(http_requests_total{user_id!=\"\"}[24h])))", - "legendFormat": "Users (24h)" - } - ] - } - ] - } - } diff --git a/infrastructure/kubernetes/base/components/monitoring/grafana-dashboards.yaml b/infrastructure/kubernetes/base/components/monitoring/grafana-dashboards.yaml deleted file mode 100644 index 3ea7cbe7..00000000 --- a/infrastructure/kubernetes/base/components/monitoring/grafana-dashboards.yaml +++ /dev/null @@ -1,177 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: grafana-dashboards - namespace: monitoring -data: - gateway-metrics.json: | - { - "dashboard": { - "title": "Bakery IA - Gateway Metrics", - "tags": ["bakery-ia", "gateway"], - "timezone": "browser", - "panels": [ - { - "id": 1, - "title": "Request Rate by Endpoint", - "type": "graph", - "gridPos": {"x": 0, "y": 0, "w": 12, "h": 8}, - "targets": [{ - "expr": "rate(http_requests_total{service=\"gateway\"}[5m])", - "legendFormat": "{{method}} {{endpoint}}" - }] - }, - { - "id": 2, - "title": "P95 Request Latency", - "type": "graph", - "gridPos": {"x": 12, "y": 0, "w": 12, "h": 8}, - "targets": [{ - "expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{service=\"gateway\"}[5m]))", - "legendFormat": "{{endpoint}} p95" - }] - }, - { - "id": 3, - "title": "Error Rate (5xx)", - "type": "graph", - "gridPos": {"x": 0, "y": 8, "w": 12, "h": 8}, - "targets": [{ - "expr": "rate(http_requests_total{service=\"gateway\",status_code=~\"5..\"}[5m])", - "legendFormat": "{{endpoint}} errors" - }] - }, - { - "id": 4, - "title": "Active Requests", - "type": "stat", - "gridPos": {"x": 12, "y": 8, "w": 6, "h": 4}, - "targets": [{ - "expr": "sum(rate(http_requests_total{service=\"gateway\"}[1m]))" - }] - }, - { - "id": 5, - "title": "Authentication Success Rate", - "type": "stat", - "gridPos": {"x": 18, "y": 8, "w": 6, "h": 4}, - "targets": [{ - "expr": 
"rate(gateway_auth_responses_total[5m]) / rate(gateway_auth_requests_total[5m]) * 100" - }] - } - ], - "refresh": "10s", - "schemaVersion": 16, - "version": 1 - } - } - - services-overview.json: | - { - "dashboard": { - "title": "Bakery IA - Services Overview", - "tags": ["bakery-ia", "services"], - "timezone": "browser", - "panels": [ - { - "id": 1, - "title": "Request Rate by Service", - "type": "graph", - "gridPos": {"x": 0, "y": 0, "w": 12, "h": 8}, - "targets": [{ - "expr": "sum by (service) (rate(http_requests_total[5m]))", - "legendFormat": "{{service}}" - }] - }, - { - "id": 2, - "title": "P99 Latency by Service", - "type": "graph", - "gridPos": {"x": 12, "y": 0, "w": 12, "h": 8}, - "targets": [{ - "expr": "histogram_quantile(0.99, sum by (service, le) (rate(http_request_duration_seconds_bucket[5m])))", - "legendFormat": "{{service}} p99" - }] - }, - { - "id": 3, - "title": "Error Rate by Service", - "type": "graph", - "gridPos": {"x": 0, "y": 8, "w": 24, "h": 8}, - "targets": [{ - "expr": "sum by (service) (rate(http_requests_total{status_code=~\"5..\"}[5m]))", - "legendFormat": "{{service}}" - }] - }, - { - "id": 4, - "title": "Service Health Status", - "type": "table", - "gridPos": {"x": 0, "y": 16, "w": 24, "h": 8}, - "targets": [{ - "expr": "up{job=\"bakery-services\"}", - "format": "table", - "instant": true - }], - "transformations": [{ - "id": "organize", - "options": { - "excludeByName": {}, - "indexByName": {}, - "renameByName": { - "service": "Service Name", - "Value": "Status" - } - } - }] - } - ], - "refresh": "30s", - "schemaVersion": 16, - "version": 1 - } - } - - circuit-breakers.json: | - { - "dashboard": { - "title": "Bakery IA - Circuit Breakers", - "tags": ["bakery-ia", "reliability"], - "timezone": "browser", - "panels": [ - { - "id": 1, - "title": "Circuit Breaker States", - "type": "stat", - "gridPos": {"x": 0, "y": 0, "w": 24, "h": 4}, - "targets": [{ - "expr": "circuit_breaker_state", - "legendFormat": "{{service}} - {{state}}" - }] - }, - { - "id": 2, - "title": "Circuit Breaker Trips", - "type": "graph", - "gridPos": {"x": 0, "y": 4, "w": 12, "h": 8}, - "targets": [{ - "expr": "rate(circuit_breaker_opened_total[5m])", - "legendFormat": "{{service}}" - }] - }, - { - "id": 3, - "title": "Rejected Requests", - "type": "graph", - "gridPos": {"x": 12, "y": 4, "w": 12, "h": 8}, - "targets": [{ - "expr": "rate(circuit_breaker_rejected_total[5m])", - "legendFormat": "{{service}}" - }] - } - ], - "refresh": "10s", - "schemaVersion": 16, - "version": 1 - } - } diff --git a/infrastructure/kubernetes/base/components/monitoring/grafana.yaml b/infrastructure/kubernetes/base/components/monitoring/grafana.yaml deleted file mode 100644 index c48847f1..00000000 --- a/infrastructure/kubernetes/base/components/monitoring/grafana.yaml +++ /dev/null @@ -1,166 +0,0 @@ ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: grafana-datasources - namespace: monitoring -data: - prometheus.yaml: | - apiVersion: 1 - datasources: - - name: Prometheus - type: prometheus - access: proxy - url: http://prometheus:9090 - isDefault: true - editable: false - ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: grafana-dashboards-config - namespace: monitoring -data: - dashboards.yaml: | - apiVersion: 1 - providers: - - name: 'default' - orgId: 1 - folder: 'Bakery IA' - type: file - disableDeletion: false - updateIntervalSeconds: 10 - allowUiUpdates: true - options: - path: /var/lib/grafana/dashboards - - name: 'extended' - orgId: 1 - folder: 'Bakery IA - Extended' - type: file - 
disableDeletion: false - updateIntervalSeconds: 10 - allowUiUpdates: true - options: - path: /var/lib/grafana/dashboards-extended - ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: grafana - namespace: monitoring - labels: - app: grafana -spec: - replicas: 1 - selector: - matchLabels: - app: grafana - template: - metadata: - labels: - app: grafana - spec: - containers: - - name: grafana - image: grafana/grafana:12.3.0 - ports: - - containerPort: 3000 - name: http - env: - - name: GF_SECURITY_ADMIN_USER - valueFrom: - secretKeyRef: - name: grafana-admin - key: admin-user - - name: GF_SECURITY_ADMIN_PASSWORD - valueFrom: - secretKeyRef: - name: grafana-admin - key: admin-password - - name: GF_SERVER_ROOT_URL - value: "http://monitoring.bakery-ia.local/grafana" - - name: GF_SERVER_SERVE_FROM_SUB_PATH - value: "true" - - name: GF_AUTH_ANONYMOUS_ENABLED - value: "false" - - name: GF_INSTALL_PLUGINS - value: "" - volumeMounts: - - name: grafana-storage - mountPath: /var/lib/grafana - - name: grafana-datasources - mountPath: /etc/grafana/provisioning/datasources - - name: grafana-dashboards-config - mountPath: /etc/grafana/provisioning/dashboards - - name: grafana-dashboards - mountPath: /var/lib/grafana/dashboards - - name: grafana-dashboards-extended - mountPath: /var/lib/grafana/dashboards-extended - resources: - requests: - memory: "256Mi" - cpu: "100m" - limits: - memory: "512Mi" - cpu: "500m" - livenessProbe: - httpGet: - path: /api/health - port: 3000 - initialDelaySeconds: 30 - periodSeconds: 10 - readinessProbe: - httpGet: - path: /api/health - port: 3000 - initialDelaySeconds: 5 - periodSeconds: 5 - volumes: - - name: grafana-storage - persistentVolumeClaim: - claimName: grafana-storage - - name: grafana-datasources - configMap: - name: grafana-datasources - - name: grafana-dashboards-config - configMap: - name: grafana-dashboards-config - - name: grafana-dashboards - configMap: - name: grafana-dashboards - - name: grafana-dashboards-extended - configMap: - name: grafana-dashboards-extended - ---- -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: grafana-storage - namespace: monitoring -spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 5Gi - ---- -apiVersion: v1 -kind: Service -metadata: - name: grafana - namespace: monitoring - labels: - app: grafana -spec: - type: ClusterIP - ports: - - port: 3000 - targetPort: 3000 - protocol: TCP - name: http - selector: - app: grafana diff --git a/infrastructure/kubernetes/base/components/monitoring/ha-policies.yaml b/infrastructure/kubernetes/base/components/monitoring/ha-policies.yaml deleted file mode 100644 index f5443c3e..00000000 --- a/infrastructure/kubernetes/base/components/monitoring/ha-policies.yaml +++ /dev/null @@ -1,100 +0,0 @@ ---- -# PodDisruptionBudgets ensure minimum availability during voluntary disruptions -# (node drains, rolling updates, etc.) 
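The PodDisruptionBudgets in this removed ha-policies.yaml enforced a floor of ready pods (minAvailable) during node drains and rolling updates. If equivalent budgets are later applied to the SigNoz release, a quick runtime check such as the sketch below shows how much disruption headroom each budget currently allows. This is an illustrative snippet using the official kubernetes Python client; the target namespace and the kubeconfig loading are assumptions, not part of this change.

```python
"""Sketch: inspect PodDisruptionBudget headroom (hypothetical helper, not part of this PR)."""
from kubernetes import client, config


def pdb_headroom(namespace: str = "signoz") -> None:
    # Assumes a local kubeconfig; use config.load_incluster_config() when running inside a pod.
    config.load_kube_config()
    policy = client.PolicyV1Api()
    for pdb in policy.list_namespaced_pod_disruption_budget(namespace).items:
        status = pdb.status
        print(
            f"{pdb.metadata.name}: healthy {status.current_healthy}/{status.desired_healthy}, "
            f"disruptions allowed: {status.disruptions_allowed}"
        )


if __name__ == "__main__":
    pdb_headroom()
```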
- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - name: prometheus-pdb - namespace: monitoring -spec: - minAvailable: 1 - selector: - matchLabels: - app: prometheus - ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - name: alertmanager-pdb - namespace: monitoring -spec: - minAvailable: 2 - selector: - matchLabels: - app: alertmanager - ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - name: grafana-pdb - namespace: monitoring -spec: - minAvailable: 1 - selector: - matchLabels: - app: grafana - ---- -# ResourceQuota limits total resources in monitoring namespace -apiVersion: v1 -kind: ResourceQuota -metadata: - name: monitoring-quota - namespace: monitoring -spec: - hard: - # Compute resources - requests.cpu: "10" - requests.memory: "16Gi" - limits.cpu: "20" - limits.memory: "32Gi" - - # Storage - persistentvolumeclaims: "10" - requests.storage: "100Gi" - - # Object counts - pods: "50" - services: "20" - configmaps: "30" - secrets: "20" - ---- -# LimitRange sets default resource limits for pods in monitoring namespace -apiVersion: v1 -kind: LimitRange -metadata: - name: monitoring-limits - namespace: monitoring -spec: - limits: - # Default container limits - - max: - cpu: "2" - memory: "4Gi" - min: - cpu: "10m" - memory: "16Mi" - default: - cpu: "500m" - memory: "512Mi" - defaultRequest: - cpu: "100m" - memory: "128Mi" - type: Container - - # Pod limits - - max: - cpu: "4" - memory: "8Gi" - type: Pod - - # PVC limits - - max: - storage: "50Gi" - min: - storage: "1Gi" - type: PersistentVolumeClaim diff --git a/infrastructure/kubernetes/base/components/monitoring/ingress.yaml b/infrastructure/kubernetes/base/components/monitoring/ingress.yaml deleted file mode 100644 index 5be8a584..00000000 --- a/infrastructure/kubernetes/base/components/monitoring/ingress.yaml +++ /dev/null @@ -1,42 +0,0 @@ ---- -apiVersion: networking.k8s.io/v1 -kind: Ingress -metadata: - name: monitoring-ingress - namespace: monitoring - annotations: - nginx.ingress.kubernetes.io/rewrite-target: /$2 - nginx.ingress.kubernetes.io/ssl-redirect: "false" -spec: - rules: - - host: monitoring.bakery-ia.local - http: - paths: - - path: /grafana(/|$)(.*) - pathType: ImplementationSpecific - backend: - service: - name: grafana - port: - number: 3000 - - path: /prometheus(/|$)(.*) - pathType: ImplementationSpecific - backend: - service: - name: prometheus-external - port: - number: 9090 - - path: /jaeger(/|$)(.*) - pathType: ImplementationSpecific - backend: - service: - name: jaeger-query - port: - number: 16686 - - path: /alertmanager(/|$)(.*) - pathType: ImplementationSpecific - backend: - service: - name: alertmanager-external - port: - number: 9093 diff --git a/infrastructure/kubernetes/base/components/monitoring/jaeger.yaml b/infrastructure/kubernetes/base/components/monitoring/jaeger.yaml deleted file mode 100644 index 9c2e6744..00000000 --- a/infrastructure/kubernetes/base/components/monitoring/jaeger.yaml +++ /dev/null @@ -1,190 +0,0 @@ ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: jaeger - namespace: monitoring - labels: - app: jaeger -spec: - replicas: 1 - selector: - matchLabels: - app: jaeger - template: - metadata: - labels: - app: jaeger - spec: - containers: - - name: jaeger - image: jaegertracing/all-in-one:1.51 - env: - - name: COLLECTOR_ZIPKIN_HOST_PORT - value: ":9411" - - name: COLLECTOR_OTLP_ENABLED - value: "true" - - name: SPAN_STORAGE_TYPE - value: "badger" - - name: BADGER_EPHEMERAL - value: "false" - - name: BADGER_DIRECTORY_VALUE - 
value: "/badger/data" - - name: BADGER_DIRECTORY_KEY - value: "/badger/key" - ports: - - containerPort: 5775 - protocol: UDP - name: zipkin-compact - - containerPort: 6831 - protocol: UDP - name: jaeger-compact - - containerPort: 6832 - protocol: UDP - name: jaeger-binary - - containerPort: 5778 - protocol: TCP - name: config-rest - - containerPort: 16686 - protocol: TCP - name: query - - containerPort: 14250 - protocol: TCP - name: grpc - - containerPort: 14268 - protocol: TCP - name: c-tchan-trft - - containerPort: 14269 - protocol: TCP - name: admin-http - - containerPort: 9411 - protocol: TCP - name: zipkin - - containerPort: 4317 - protocol: TCP - name: otlp-grpc - - containerPort: 4318 - protocol: TCP - name: otlp-http - volumeMounts: - - name: jaeger-storage - mountPath: /badger - resources: - requests: - memory: "512Mi" - cpu: "250m" - limits: - memory: "1Gi" - cpu: "500m" - livenessProbe: - httpGet: - path: / - port: 14269 - initialDelaySeconds: 30 - periodSeconds: 10 - readinessProbe: - httpGet: - path: / - port: 14269 - initialDelaySeconds: 5 - periodSeconds: 5 - volumes: - - name: jaeger-storage - persistentVolumeClaim: - claimName: jaeger-storage - ---- -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: jaeger-storage - namespace: monitoring -spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 10Gi - ---- -apiVersion: v1 -kind: Service -metadata: - name: jaeger-query - namespace: monitoring - labels: - app: jaeger -spec: - type: ClusterIP - ports: - - port: 16686 - targetPort: 16686 - protocol: TCP - name: query - selector: - app: jaeger - ---- -apiVersion: v1 -kind: Service -metadata: - name: jaeger-collector - namespace: monitoring - labels: - app: jaeger -spec: - type: ClusterIP - ports: - - port: 14268 - targetPort: 14268 - protocol: TCP - name: c-tchan-trft - - port: 14250 - targetPort: 14250 - protocol: TCP - name: grpc - - port: 9411 - targetPort: 9411 - protocol: TCP - name: zipkin - - port: 4317 - targetPort: 4317 - protocol: TCP - name: otlp-grpc - - port: 4318 - targetPort: 4318 - protocol: TCP - name: otlp-http - selector: - app: jaeger - ---- -apiVersion: v1 -kind: Service -metadata: - name: jaeger-agent - namespace: monitoring - labels: - app: jaeger -spec: - type: ClusterIP - clusterIP: None - ports: - - port: 5775 - targetPort: 5775 - protocol: UDP - name: zipkin-compact - - port: 6831 - targetPort: 6831 - protocol: UDP - name: jaeger-compact - - port: 6832 - targetPort: 6832 - protocol: UDP - name: jaeger-binary - - port: 5778 - targetPort: 5778 - protocol: TCP - name: config-rest - selector: - app: jaeger diff --git a/infrastructure/kubernetes/base/components/monitoring/kustomization.yaml b/infrastructure/kubernetes/base/components/monitoring/kustomization.yaml index 224cbd24..618dfa10 100644 --- a/infrastructure/kubernetes/base/components/monitoring/kustomization.yaml +++ b/infrastructure/kubernetes/base/components/monitoring/kustomization.yaml @@ -1,18 +1,20 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization +# Minimal Monitoring Infrastructure +# SigNoz is now managed via Helm in the 'signoz' namespace +# This kustomization only maintains: +# - Namespace for legacy resources (if needed) +# - Node exporter for infrastructure metrics +# - PostgreSQL exporter for database metrics +# - Optional OTEL collector (can be disabled if using SigNoz's built-in collector) + resources: - namespace.yaml - secrets.yaml - - prometheus.yaml - - alert-rules.yaml - - alertmanager.yaml - - alertmanager-init.yaml - - 
grafana.yaml - - grafana-dashboards.yaml - - grafana-dashboards-extended.yaml - - postgres-exporter.yaml + # Exporters for metrics collection - node-exporter.yaml - - jaeger.yaml - - ha-policies.yaml - - ingress.yaml + - postgres-exporter.yaml + # Optional: Keep OTEL collector or use SigNoz's built-in one + # Uncomment if you want a dedicated OTEL collector in monitoring namespace + # - otel-collector.yaml diff --git a/infrastructure/kubernetes/base/components/monitoring/otel-collector.yaml b/infrastructure/kubernetes/base/components/monitoring/otel-collector.yaml new file mode 100644 index 00000000..c243d516 --- /dev/null +++ b/infrastructure/kubernetes/base/components/monitoring/otel-collector.yaml @@ -0,0 +1,167 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: otel-collector-config + namespace: monitoring +data: + otel-collector-config.yaml: | + extensions: + health_check: + endpoint: 0.0.0.0:13133 + + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + + processors: + batch: + timeout: 10s + send_batch_size: 1024 + + # Memory limiter to prevent OOM + memory_limiter: + check_interval: 1s + limit_mib: 512 + spike_limit_mib: 128 + + exporters: + # Export metrics to Prometheus + prometheus: + endpoint: "0.0.0.0:8889" + namespace: otelcol + const_labels: + source: otel-collector + + # Export to SigNoz + otlp/signoz: + endpoint: "signoz-query-service.monitoring.svc.cluster.local:8080" + tls: + insecure: true + + # Logging exporter for debugging traces and logs + logging: + loglevel: info + sampling_initial: 5 + sampling_thereafter: 200 + + service: + extensions: [health_check] + pipelines: + # Traces pipeline: receive -> process -> export to SigNoz + traces: + receivers: [otlp] + processors: [memory_limiter, batch] + exporters: [otlp/signoz, logging] + + # Metrics pipeline: receive -> process -> export to both Prometheus and SigNoz + metrics: + receivers: [otlp] + processors: [memory_limiter, batch] + exporters: [prometheus, otlp/signoz] + + # Logs pipeline: receive -> process -> export to SigNoz + logs: + receivers: [otlp] + processors: [memory_limiter, batch] + exporters: [otlp/signoz, logging] + +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: otel-collector + namespace: monitoring + labels: + app: otel-collector +spec: + replicas: 1 + selector: + matchLabels: + app: otel-collector + template: + metadata: + labels: + app: otel-collector + spec: + containers: + - name: otel-collector + image: otel/opentelemetry-collector-contrib:0.91.0 + args: + - --config=/conf/otel-collector-config.yaml + ports: + - containerPort: 4317 + protocol: TCP + name: otlp-grpc + - containerPort: 4318 + protocol: TCP + name: otlp-http + - containerPort: 8889 + protocol: TCP + name: prometheus + - containerPort: 13133 + protocol: TCP + name: health-check + volumeMounts: + - name: otel-collector-config + mountPath: /conf + resources: + requests: + memory: "256Mi" + cpu: "100m" + limits: + memory: "512Mi" + cpu: "500m" + livenessProbe: + httpGet: + path: / + port: 13133 + initialDelaySeconds: 30 + periodSeconds: 10 + readinessProbe: + httpGet: + path: / + port: 13133 + initialDelaySeconds: 5 + periodSeconds: 5 + volumes: + - name: otel-collector-config + configMap: + name: otel-collector-config + items: + - key: otel-collector-config.yaml + path: otel-collector-config.yaml + +--- +apiVersion: v1 +kind: Service +metadata: + name: otel-collector + namespace: monitoring + labels: + app: otel-collector + annotations: + prometheus.io/scrape: "true" 
+ prometheus.io/port: "8889" + prometheus.io/path: "/metrics" +spec: + type: ClusterIP + ports: + - port: 4317 + targetPort: 4317 + protocol: TCP + name: otlp-grpc + - port: 4318 + targetPort: 4318 + protocol: TCP + name: otlp-http + - port: 8889 + targetPort: 8889 + protocol: TCP + name: prometheus + selector: + app: otel-collector diff --git a/infrastructure/kubernetes/base/components/monitoring/prometheus.yaml b/infrastructure/kubernetes/base/components/monitoring/prometheus.yaml deleted file mode 100644 index 0c1fce39..00000000 --- a/infrastructure/kubernetes/base/components/monitoring/prometheus.yaml +++ /dev/null @@ -1,278 +0,0 @@ ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: prometheus - namespace: monitoring - ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: prometheus -rules: -- apiGroups: [""] - resources: - - nodes - - nodes/proxy - - services - - endpoints - - pods - verbs: ["get", "list", "watch"] -- apiGroups: - - extensions - resources: - - ingresses - verbs: ["get", "list", "watch"] -- nonResourceURLs: ["/metrics"] - verbs: ["get"] - ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: prometheus -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: prometheus -subjects: -- kind: ServiceAccount - name: prometheus - namespace: monitoring - ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: prometheus-config - namespace: monitoring -data: - prometheus.yml: | - global: - scrape_interval: 30s - evaluation_interval: 30s - external_labels: - cluster: 'bakery-ia' - environment: 'production' - - # AlertManager configuration - alerting: - alertmanagers: - - static_configs: - - targets: - - alertmanager-0.alertmanager.monitoring.svc.cluster.local:9093 - - alertmanager-1.alertmanager.monitoring.svc.cluster.local:9093 - - alertmanager-2.alertmanager.monitoring.svc.cluster.local:9093 - - # Load alert rules - rule_files: - - '/etc/prometheus/rules/*.yml' - - scrape_configs: - # Scrape Prometheus itself - - job_name: 'prometheus' - static_configs: - - targets: ['localhost:9090'] - - # Scrape all bakery-ia services - - job_name: 'bakery-services' - kubernetes_sd_configs: - - role: pod - namespaces: - names: - - bakery-ia - relabel_configs: - # Only scrape pods with metrics port - - source_labels: [__meta_kubernetes_pod_container_port_name] - action: keep - regex: http - - # Add service name label - - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name] - target_label: service - - # Add component label - - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component] - target_label: component - - # Add pod name - - source_labels: [__meta_kubernetes_pod_name] - target_label: pod - - # Set metrics path - - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] - action: replace - target_label: __metrics_path__ - regex: (.+) - - # Set scrape port - - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] - action: replace - regex: ([^:]+)(?::\d+)?;(\d+) - replacement: $1:$2 - target_label: __address__ - - # Scrape Kubernetes nodes - - job_name: 'kubernetes-nodes' - kubernetes_sd_configs: - - role: node - relabel_configs: - - action: labelmap - regex: __meta_kubernetes_node_label_(.+) - - target_label: __address__ - replacement: kubernetes.default.svc:443 - - source_labels: [__meta_kubernetes_node_name] - regex: (.+) - target_label: __metrics_path__ - replacement: /api/v1/nodes/${1}/proxy/metrics - - # Scrape 
AlertManager - - job_name: 'alertmanager' - static_configs: - - targets: - - alertmanager-0.alertmanager.monitoring.svc.cluster.local:9093 - - alertmanager-1.alertmanager.monitoring.svc.cluster.local:9093 - - alertmanager-2.alertmanager.monitoring.svc.cluster.local:9093 - - # Scrape PostgreSQL exporter - - job_name: 'postgres-exporter' - static_configs: - - targets: ['postgres-exporter.monitoring.svc.cluster.local:9187'] - - # Scrape Node Exporter - - job_name: 'node-exporter' - kubernetes_sd_configs: - - role: node - relabel_configs: - - source_labels: [__address__] - regex: '(.*):10250' - replacement: '${1}:9100' - target_label: __address__ - - source_labels: [__meta_kubernetes_node_name] - target_label: node - ---- -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: prometheus - namespace: monitoring - labels: - app: prometheus -spec: - serviceName: prometheus - replicas: 2 - selector: - matchLabels: - app: prometheus - template: - metadata: - labels: - app: prometheus - spec: - serviceAccountName: prometheus - affinity: - podAntiAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - weight: 100 - podAffinityTerm: - labelSelector: - matchExpressions: - - key: app - operator: In - values: - - prometheus - topologyKey: kubernetes.io/hostname - containers: - - name: prometheus - image: prom/prometheus:v3.0.1 - args: - - '--config.file=/etc/prometheus/prometheus.yml' - - '--storage.tsdb.path=/prometheus' - - '--storage.tsdb.retention.time=30d' - - '--web.console.libraries=/usr/share/prometheus/console_libraries' - - '--web.console.templates=/usr/share/prometheus/consoles' - - '--web.enable-lifecycle' - ports: - - containerPort: 9090 - name: web - volumeMounts: - - name: prometheus-config - mountPath: /etc/prometheus - - name: prometheus-rules - mountPath: /etc/prometheus/rules - - name: prometheus-storage - mountPath: /prometheus - resources: - requests: - memory: "1Gi" - cpu: "500m" - limits: - memory: "2Gi" - cpu: "1" - livenessProbe: - httpGet: - path: /-/healthy - port: 9090 - initialDelaySeconds: 30 - periodSeconds: 10 - readinessProbe: - httpGet: - path: /-/ready - port: 9090 - initialDelaySeconds: 5 - periodSeconds: 5 - volumes: - - name: prometheus-config - configMap: - name: prometheus-config - - name: prometheus-rules - configMap: - name: prometheus-alert-rules - - volumeClaimTemplates: - - metadata: - name: prometheus-storage - spec: - accessModes: [ "ReadWriteOnce" ] - resources: - requests: - storage: 20Gi - ---- -apiVersion: v1 -kind: Service -metadata: - name: prometheus - namespace: monitoring - labels: - app: prometheus -spec: - type: ClusterIP - clusterIP: None - ports: - - port: 9090 - targetPort: 9090 - protocol: TCP - name: web - selector: - app: prometheus - ---- -apiVersion: v1 -kind: Service -metadata: - name: prometheus-external - namespace: monitoring - labels: - app: prometheus -spec: - type: ClusterIP - ports: - - port: 9090 - targetPort: 9090 - protocol: TCP - name: web - selector: - app: prometheus diff --git a/infrastructure/kubernetes/base/configmap.yaml b/infrastructure/kubernetes/base/configmap.yaml index 1141784e..63d2a516 100644 --- a/infrastructure/kubernetes/base/configmap.yaml +++ b/infrastructure/kubernetes/base/configmap.yaml @@ -14,9 +14,10 @@ data: DEBUG: "false" LOG_LEVEL: "INFO" - # Observability Settings - # Set to "true" when Jaeger/monitoring stack is deployed - ENABLE_TRACING: "false" + # Observability Settings - SigNoz enabled + ENABLE_TRACING: "true" + ENABLE_METRICS: "true" + ENABLE_LOGS: "true" # Database initialization 
settings # IMPORTANT: Services NEVER run migrations - they only verify DB is ready @@ -286,12 +287,11 @@ data: LOG_FILE_PATH: "/app/logs" LOG_ROTATION_SIZE: "100MB" LOG_RETENTION_DAYS: "30" - PROMETHEUS_ENABLED: "true" - PROMETHEUS_RETENTION: "200h" HEALTH_CHECK_TIMEOUT: "30" HEALTH_CHECK_INTERVAL: "30" - PROMETHEUS_RETENTION_DAYS: "30" - GRAFANA_ROOT_URL: "http://monitoring.bakery-ia.local/grafana" + + # Monitoring Configuration - SigNoz + SIGNOZ_ROOT_URL: "http://localhost/signoz" # ================================================================ # DATA COLLECTION SETTINGS @@ -382,16 +382,20 @@ data: NOMINATIM_CPU_LIMIT: "4" # ================================================================ - # DISTRIBUTED TRACING (Jaeger/OpenTelemetry) + # OBSERVABILITY - SigNoz (Unified Monitoring) # ================================================================ - JAEGER_COLLECTOR_ENDPOINT: "http://jaeger-collector.monitoring:4317" - JAEGER_AGENT_HOST: "jaeger-agent.monitoring" - JAEGER_AGENT_PORT: "6831" - OTEL_EXPORTER_OTLP_ENDPOINT: "http://jaeger-collector.monitoring:4317" + # OpenTelemetry Configuration - Direct to SigNoz + OTEL_EXPORTER_OTLP_ENDPOINT: "http://signoz-otel-collector.signoz.svc.cluster.local:4317" + OTEL_EXPORTER_OTLP_PROTOCOL: "grpc" OTEL_SERVICE_NAME: "bakery-ia" + OTEL_RESOURCE_ATTRIBUTES: "deployment.environment=development" + + # SigNoz Endpoints + SIGNOZ_ENDPOINT: "http://signoz-query-service.signoz.svc.cluster.local:8080" + SIGNOZ_FRONTEND_URL: "http://signoz-frontend.signoz.svc.cluster.local:3301" # ================================================================ - # REPLENISHMENT PLANNING SETTINGS + # REPLENISHMENT PLANNING SETTINGS # ================================================================ REPLENISHMENT_PROJECTION_HORIZON_DAYS: "7" REPLENISHMENT_SERVICE_LEVEL: "0.95" diff --git a/infrastructure/kubernetes/overlays/dev/kustomization.yaml b/infrastructure/kubernetes/overlays/dev/kustomization.yaml index b568a6f2..56a13f5e 100644 --- a/infrastructure/kubernetes/overlays/dev/kustomization.yaml +++ b/infrastructure/kubernetes/overlays/dev/kustomization.yaml @@ -9,11 +9,14 @@ metadata: resources: - ../../base - # Monitoring disabled for dev to save resources - # - ../../base/components/monitoring + # Monitoring enabled for dev environment + - ../../base/components/monitoring - dev-ingress.yaml + # SigNoz ingress is applied by Tilt (see Tiltfile) + # - signoz-ingress.yaml # Dev-Prod Parity: Enable HTTPS with self-signed certificates - dev-certificate.yaml + - monitoring-certificate.yaml - cluster-issuer-staging.yaml # Exclude nominatim from dev to save resources @@ -608,6 +611,39 @@ patches: limits: memory: "512Mi" cpu: "300m" + # Optional exporters resource patches for dev + - target: + group: apps + version: v1 + kind: DaemonSet + name: node-exporter + namespace: monitoring + patch: |- + - op: replace + path: /spec/template/spec/containers/0/resources + value: + requests: + memory: "32Mi" + cpu: "25m" + limits: + memory: "64Mi" + cpu: "100m" + - target: + group: apps + version: v1 + kind: Deployment + name: postgres-exporter + namespace: monitoring + patch: |- + - op: replace + path: /spec/template/spec/containers/0/resources + value: + requests: + memory: "32Mi" + cpu: "25m" + limits: + memory: "64Mi" + cpu: "100m" secretGenerator: - name: dev-secrets diff --git a/infrastructure/kubernetes/overlays/dev/monitoring-certificate.yaml b/infrastructure/kubernetes/overlays/dev/monitoring-certificate.yaml new file mode 100644 index 00000000..a51351fb --- /dev/null +++ 
b/infrastructure/kubernetes/overlays/dev/monitoring-certificate.yaml @@ -0,0 +1,49 @@ +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: bakery-dev-monitoring-tls-cert + namespace: monitoring +spec: + # Self-signed certificate for local development + secretName: bakery-ia-tls-cert + + # Certificate duration + duration: 2160h # 90 days + renewBefore: 360h # 15 days + + # Subject configuration + subject: + organizations: + - Bakery IA Development + + # Common name + commonName: localhost + + # DNS names this certificate is valid for + dnsNames: + - localhost + - monitoring.bakery-ia.local + + # IP addresses (for localhost) + ipAddresses: + - 127.0.0.1 + - ::1 + + # Use self-signed issuer for development + issuerRef: + name: selfsigned-issuer + kind: ClusterIssuer + group: cert-manager.io + + # Private key configuration + privateKey: + algorithm: RSA + encoding: PKCS1 + size: 2048 + + # Usages + usages: + - server auth + - client auth + - digital signature + - key encipherment diff --git a/infrastructure/kubernetes/overlays/dev/signoz-ingress.yaml b/infrastructure/kubernetes/overlays/dev/signoz-ingress.yaml new file mode 100644 index 00000000..54dc070c --- /dev/null +++ b/infrastructure/kubernetes/overlays/dev/signoz-ingress.yaml @@ -0,0 +1,39 @@ +--- +# SigNoz Ingress for Development (localhost) +# SigNoz is deployed via Helm in the 'signoz' namespace +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: signoz-ingress-localhost + namespace: signoz + annotations: + nginx.ingress.kubernetes.io/ssl-redirect: "true" + nginx.ingress.kubernetes.io/force-ssl-redirect: "true" + nginx.ingress.kubernetes.io/rewrite-target: /$2 + nginx.ingress.kubernetes.io/use-regex: "true" +spec: + ingressClassName: nginx + tls: + - hosts: + - localhost + secretName: bakery-ia-tls-cert + rules: + - host: localhost + http: + paths: + # SigNoz Frontend UI + - path: /signoz(/|$)(.*) + pathType: ImplementationSpecific + backend: + service: + name: signoz-frontend + port: + number: 3301 + # SigNoz Query Service API + - path: /signoz-api(/|$)(.*) + pathType: ImplementationSpecific + backend: + service: + name: signoz-query-service + port: + number: 8080 diff --git a/infrastructure/kubernetes/overlays/prod/kustomization.yaml b/infrastructure/kubernetes/overlays/prod/kustomization.yaml index 5f485110..9de6cfc3 100644 --- a/infrastructure/kubernetes/overlays/prod/kustomization.yaml +++ b/infrastructure/kubernetes/overlays/prod/kustomization.yaml @@ -14,6 +14,7 @@ resources: patchesStrategicMerge: - storage-patch.yaml + - monitoring-ingress-patch.yaml labels: - includeSelectors: true @@ -21,6 +22,89 @@ labels: environment: production tier: production +# SigNoz resource patches for production +patches: + # SigNoz ClickHouse production configuration + - target: + group: apps + version: v1 + kind: StatefulSet + name: signoz-clickhouse + namespace: signoz + patch: |- + - op: replace + path: /spec/replicas + value: 2 + - op: replace + path: /spec/template/spec/containers/0/resources + value: + requests: + memory: "2Gi" + cpu: "500m" + limits: + memory: "4Gi" + cpu: "1000m" + # SigNoz Query Service production configuration + - target: + group: apps + version: v1 + kind: Deployment + name: signoz-query-service + namespace: signoz + patch: |- + - op: replace + path: /spec/replicas + value: 2 + - op: replace + path: /spec/template/spec/containers/0/resources + value: + requests: + memory: "1Gi" + cpu: "500m" + limits: + memory: "2Gi" + cpu: "1000m" + # SigNoz AlertManager production configuration + - 
target: + group: apps + version: v1 + kind: Deployment + name: signoz-alertmanager + namespace: signoz + patch: |- + - op: replace + path: /spec/replicas + value: 2 + - op: replace + path: /spec/template/spec/containers/0/resources + value: + requests: + memory: "512Mi" + cpu: "250m" + limits: + memory: "1Gi" + cpu: "500m" + # SigNoz Frontend production configuration + - target: + group: apps + version: v1 + kind: Deployment + name: signoz-frontend + namespace: signoz + patch: |- + - op: replace + path: /spec/replicas + value: 2 + - op: replace + path: /spec/template/spec/containers/0/resources + value: + requests: + memory: "512Mi" + cpu: "250m" + limits: + memory: "1Gi" + cpu: "500m" + images: - name: bakery/auth-service newTag: latest diff --git a/infrastructure/kubernetes/overlays/prod/prod-configmap.yaml b/infrastructure/kubernetes/overlays/prod/prod-configmap.yaml index da373dbd..ddb40de6 100644 --- a/infrastructure/kubernetes/overlays/prod/prod-configmap.yaml +++ b/infrastructure/kubernetes/overlays/prod/prod-configmap.yaml @@ -17,14 +17,30 @@ data: REQUEST_TIMEOUT: "30" MAX_CONNECTIONS: "100" - # Monitoring - PROMETHEUS_ENABLED: "true" + # Monitoring - SigNoz (Unified Observability) ENABLE_TRACING: "true" ENABLE_METRICS: "true" - JAEGER_ENABLED: "true" - JAEGER_AGENT_HOST: "jaeger-agent.monitoring.svc.cluster.local" - JAEGER_AGENT_PORT: "6831" + ENABLE_LOGS: "true" + + # OpenTelemetry Configuration - Direct to SigNoz + OTEL_EXPORTER_OTLP_ENDPOINT: "http://signoz-otel-collector.signoz.svc.cluster.local:4317" + OTEL_EXPORTER_OTLP_PROTOCOL: "grpc" + OTEL_SERVICE_NAME: "bakery-ia" + OTEL_RESOURCE_ATTRIBUTES: "deployment.environment=production,cluster.name=bakery-ia-prod" + + # SigNoz Endpoints + SIGNOZ_ENDPOINT: "http://signoz-query-service.signoz.svc.cluster.local:8080" + SIGNOZ_FRONTEND_URL: "https://monitoring.bakewise.ai/signoz" + SIGNOZ_ROOT_URL: "https://monitoring.bakewise.ai/signoz" # Rate Limiting (stricter in production) RATE_LIMIT_ENABLED: "true" RATE_LIMIT_PER_MINUTE: "60" + + # CORS Configuration for Production + CORS_ORIGINS: "https://bakewise.ai" + CORS_ALLOW_CREDENTIALS: "true" + + # Frontend Configuration + VITE_API_URL: "/api" + VITE_ENVIRONMENT: "production" diff --git a/infrastructure/kubernetes/overlays/prod/prod-ingress.yaml b/infrastructure/kubernetes/overlays/prod/prod-ingress.yaml index 0acbe64f..a3f7d690 100644 --- a/infrastructure/kubernetes/overlays/prod/prod-ingress.yaml +++ b/infrastructure/kubernetes/overlays/prod/prod-ingress.yaml @@ -16,7 +16,7 @@ metadata: # CORS configuration for production nginx.ingress.kubernetes.io/enable-cors: "true" - nginx.ingress.kubernetes.io/cors-allow-origin: "https://bakery.yourdomain.com,https://api.yourdomain.com" + nginx.ingress.kubernetes.io/cors-allow-origin: "https://bakewise.ai" nginx.ingress.kubernetes.io/cors-allow-methods: "GET, POST, PUT, DELETE, OPTIONS, PATCH" nginx.ingress.kubernetes.io/cors-allow-headers: "Content-Type, Authorization, X-Requested-With, Accept, Origin" nginx.ingress.kubernetes.io/cors-allow-credentials: "true" @@ -40,12 +40,10 @@ spec: ingressClassName: nginx tls: - hosts: - - bakery.yourdomain.com - - api.yourdomain.com - - monitoring.yourdomain.com + - bakewise.ai secretName: bakery-ia-prod-tls-cert rules: - - host: bakery.yourdomain.com + - host: bakewise.ai http: paths: - path: / @@ -55,7 +53,7 @@ spec: name: frontend-service port: number: 3000 - - path: /api + - path: /api/v1 pathType: Prefix backend: service: @@ -63,31 +61,4 @@ spec: port: number: 8000 - - host: api.yourdomain.com - 
http: - paths: - - path: / - pathType: Prefix - backend: - service: - name: gateway-service - port: - number: 8000 - - - host: monitoring.yourdomain.com - http: - paths: - - path: /grafana - pathType: Prefix - backend: - service: - name: grafana-service - port: - number: 3000 - - path: /prometheus - pathType: Prefix - backend: - service: - name: prometheus-service - port: - number: 9090 + # Monitoring (monitoring.bakewise.ai) is now handled by signoz-ingress.yaml in the signoz namespace diff --git a/infrastructure/kubernetes/overlays/prod/signoz-ingress.yaml b/infrastructure/kubernetes/overlays/prod/signoz-ingress.yaml new file mode 100644 index 00000000..fbedc444 --- /dev/null +++ b/infrastructure/kubernetes/overlays/prod/signoz-ingress.yaml @@ -0,0 +1,78 @@ +--- +# SigNoz Ingress for Production +# SigNoz is deployed via Helm in the 'signoz' namespace +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: signoz-ingress-prod + namespace: signoz + labels: + app.kubernetes.io/name: signoz + app.kubernetes.io/component: ingress + annotations: + # Nginx ingress controller annotations + nginx.ingress.kubernetes.io/ssl-redirect: "true" + nginx.ingress.kubernetes.io/force-ssl-redirect: "true" + nginx.ingress.kubernetes.io/proxy-body-size: "50m" + nginx.ingress.kubernetes.io/proxy-connect-timeout: "600" + nginx.ingress.kubernetes.io/proxy-send-timeout: "600" + nginx.ingress.kubernetes.io/proxy-read-timeout: "600" + nginx.ingress.kubernetes.io/rewrite-target: /$2 + nginx.ingress.kubernetes.io/use-regex: "true" + + # CORS configuration + nginx.ingress.kubernetes.io/enable-cors: "true" + nginx.ingress.kubernetes.io/cors-allow-origin: "https://bakewise.ai,https://monitoring.bakewise.ai" + nginx.ingress.kubernetes.io/cors-allow-methods: "GET, POST, PUT, DELETE, OPTIONS, PATCH" + nginx.ingress.kubernetes.io/cors-allow-headers: "Content-Type, Authorization, X-Requested-With, Accept, Origin" + nginx.ingress.kubernetes.io/cors-allow-credentials: "true" + + # Security headers + nginx.ingress.kubernetes.io/configuration-snippet: | + more_set_headers "X-Frame-Options: SAMEORIGIN"; + more_set_headers "X-Content-Type-Options: nosniff"; + more_set_headers "X-XSS-Protection: 1; mode=block"; + more_set_headers "Referrer-Policy: strict-origin-when-cross-origin"; + + # Rate limiting + nginx.ingress.kubernetes.io/limit-rps: "100" + nginx.ingress.kubernetes.io/limit-connections: "50" + + # Cert-manager annotations for automatic certificate issuance + cert-manager.io/cluster-issuer: "letsencrypt-production" + cert-manager.io/acme-challenge-type: http01 + +spec: + ingressClassName: nginx + tls: + - hosts: + - monitoring.bakewise.ai + secretName: signoz-prod-tls-cert + rules: + - host: monitoring.bakewise.ai + http: + paths: + # SigNoz Frontend UI + - path: /signoz(/|$)(.*) + pathType: ImplementationSpecific + backend: + service: + name: signoz-frontend + port: + number: 3301 + # SigNoz Query Service API + - path: /signoz-api(/|$)(.*) + pathType: ImplementationSpecific + backend: + service: + name: signoz-query-service + port: + number: 8080 + # SigNoz AlertManager + - path: /signoz-alerts(/|$)(.*) + pathType: ImplementationSpecific + backend: + service: + name: signoz-alertmanager + port: + number: 9093 diff --git a/infrastructure/kubernetes/signoz-values.yaml b/infrastructure/kubernetes/signoz-values.yaml new file mode 100644 index 00000000..70aaacc1 --- /dev/null +++ b/infrastructure/kubernetes/signoz-values.yaml @@ -0,0 +1,79 @@ +# SigNoz Helm Chart Values - Customized for Bakery IA +# 
https://github.com/SigNoz/charts + +# Global settings +global: + storageClass: "standard" + +# Frontend configuration +frontend: + service: + type: ClusterIP + port: 3301 + ingress: + enabled: true + hosts: + - host: localhost + paths: + - path: /signoz + pathType: Prefix + annotations: + nginx.ingress.kubernetes.io/rewrite-target: /$2 + +# Query Service configuration +queryService: + replicaCount: 1 + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 200m + memory: 512Mi + +# AlertManager configuration +alertmanager: + replicaCount: 1 + resources: + requests: + cpu: 50m + memory: 128Mi + limits: + cpu: 100m + memory: 256Mi + +# ClickHouse configuration +clickhouse: + persistence: + enabled: true + size: 10Gi + resources: + requests: + cpu: 500m + memory: 1Gi + limits: + cpu: 1000m + memory: 2Gi + +# OpenTelemetry Collector configuration +otelCollector: + enabled: true + config: + exporters: + otlp: + endpoint: "signoz-query-service:8080" + service: + pipelines: + traces: + receivers: [otlp] + exporters: [otlp] + metrics: + receivers: [otlp] + exporters: [otlp] + logs: + receivers: [otlp] + exporters: [otlp] + +# Resource optimization for development +# These can be increased for production +development: true \ No newline at end of file diff --git a/kubernetes_restart.sh b/kubernetes_restart.sh index 94c10bba..5166af36 100755 --- a/kubernetes_restart.sh +++ b/kubernetes_restart.sh @@ -228,6 +228,12 @@ setup() { if [ $? -eq 0 ]; then print_success "Colima started successfully" + + # Increase inotify limits for Colima to prevent "too many open files" errors + print_status "Increasing inotify limits in Colima VM..." + colima ssh --profile k8s-local "sudo sysctl -w fs.inotify.max_user_watches=524288" + colima ssh --profile k8s-local "sudo sysctl -w fs.inotify.max_user_instances=512" + print_success "Inotify limits increased" else print_error "Failed to start Colima" exit 1 @@ -261,23 +267,23 @@ setup() { # 4. Connect registry to Kind network connect_registry_to_kind - - # 3. Install NGINX Ingress Controller + + # 5. Install NGINX Ingress Controller print_status "Installing NGINX Ingress Controller..." - + # Apply the ingress-nginx manifest kubectl apply -f https://raw.githubusercontent.com/kubernetes/ingress-nginx/main/deploy/static/provider/kind/deploy.yaml - + if [ $? -eq 0 ]; then print_success "NGINX Ingress Controller manifest applied" else print_error "Failed to apply NGINX Ingress Controller manifest" exit 1 fi - + # Wait for ingress-nginx pods to be ready with retry logic wait_for_pods "ingress-nginx" "app.kubernetes.io/component=controller" 300 - + if [ $? -ne 0 ]; then print_error "NGINX Ingress Controller failed to become ready" print_status "Checking pod status for debugging..." @@ -285,30 +291,10 @@ setup() { kubectl describe pods -n ingress-nginx exit 1 fi - - # 4. Configure permanent localhost access - print_status "Configuring localhost access via NodePort..." - - # Check if service exists - if kubectl get svc ingress-nginx-controller -n ingress-nginx &>/dev/null; then - # Patch the service to expose NodePorts - kubectl patch svc ingress-nginx-controller \ - -n ingress-nginx \ - --type merge \ - -p '{"spec":{"type":"NodePort","ports":[{"name":"http","port":80,"targetPort":"http","nodePort":30080},{"name":"https","port":443,"targetPort":"https","nodePort":30443}]}}' - - if [ $? 
-eq 0 ]; then - print_success "NodePort configuration applied" - else - print_error "Failed to patch Ingress service" - exit 1 - fi - else - print_error "Ingress NGINX controller service not found" - exit 1 - fi - - # 5. Verify port mappings from kind-config.yaml + + print_success "NGINX Ingress Controller ready (using Kind's built-in NodePort configuration)" + + # 6. Verify port mappings from kind-config.yaml print_status "Verifying port mappings from configuration..." # Extract ports from kind-config.yaml @@ -323,24 +309,24 @@ setup() { echo " - Colima profile: k8s-local" echo " - Kind cluster: $CLUSTER_NAME" echo " - Local registry: localhost:5001" - echo " - Direct port mappings (from kind-config.yaml):" - echo " Frontend: localhost:3000 -> container:30300" - echo " Gateway: localhost:8000 -> container:30800" - echo " - Ingress access:" - echo " HTTP: localhost:${HTTP_HOST_PORT} -> ingress:30080" - echo " HTTPS: localhost:${HTTPS_HOST_PORT} -> ingress:30443" - echo " - NodePort access:" - echo " HTTP: localhost:30080" - echo " HTTPS: localhost:30443" - echo "----------------------------------------" - print_status "To access your applications:" - echo " - Use Ingress via: http://localhost:${HTTP_HOST_PORT}" - echo " - Direct NodePort: http://localhost:30080" + echo "" + print_status "Port Mappings (configured in kind-config.yaml):" + echo " - HTTP Ingress: localhost:${HTTP_HOST_PORT} -> Kind NodePort 30080" + echo " - HTTPS Ingress: localhost:${HTTPS_HOST_PORT} -> Kind NodePort 30443" + echo " - Frontend Direct: localhost:3000 -> container:30300" + echo " - Gateway Direct: localhost:8000 -> container:30800" + echo "" + print_status "How to access your application:" + echo " 1. Start Tilt: tilt up" + echo " 2. Access via:" + echo " - Ingress: http://localhost (or https://localhost)" + echo " - Direct: http://localhost:3000 (frontend), http://localhost:8000 (gateway)" + echo " - Tilt UI: http://localhost:10350" echo "----------------------------------------" print_status "Local Registry Information:" echo " - Registry URL: localhost:5001" - echo " - Images will be pushed to: localhost:5001/bakery/" - echo " - Update your Tiltfile with: default_registry('localhost:5001')" + echo " - Images pushed to: localhost:5001/bakery/" + echo " - Tiltfile already configured: default_registry('localhost:5001')" echo "----------------------------------------" } diff --git a/services/ai_insights/app/main.py b/services/ai_insights/app/main.py index bf07d33f..4c79b0de 100644 --- a/services/ai_insights/app/main.py +++ b/services/ai_insights/app/main.py @@ -1,22 +1,50 @@ """Main FastAPI application for AI Insights Service.""" -from fastapi import FastAPI +from fastapi import FastAPI, Response from fastapi.middleware.cors import CORSMiddleware from contextlib import asynccontextmanager import structlog +import os from app.core.config import settings from app.core.database import init_db, close_db from app.api import insights +from shared.monitoring.logging import setup_logging +from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware -# Configure structured logging -structlog.configure( - processors=[ - structlog.processors.TimeStamper(fmt="iso"), - structlog.processors.JSONRenderer() - ] -) +# OpenTelemetry imports +from opentelemetry import trace +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter +from opentelemetry.instrumentation.fastapi 
import FastAPIInstrumentor +from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor +from opentelemetry.instrumentation.redis import RedisInstrumentor +from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor +from opentelemetry.sdk.resources import Resource +# Configure OpenTelemetry tracing +def setup_tracing(service_name: str = "ai-insights"): + """Initialize OpenTelemetry tracing with OTLP exporter for Jaeger""" + resource = Resource.create({"service.name": service_name}) + + otlp_exporter = OTLPSpanExporter( + endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://otel-collector.monitoring.svc.cluster.local:4317"), + insecure=True + ) + + provider = TracerProvider(resource=resource) + processor = BatchSpanProcessor(otlp_exporter) + provider.add_span_processor(processor) + trace.set_tracer_provider(provider) + + return provider + +# Initialize tracing +tracer_provider = setup_tracing("ai-insights") + +# Setup logging +setup_logging("ai-insights", getattr(settings, 'LOG_LEVEL', 'INFO')) logger = structlog.get_logger() @@ -28,6 +56,10 @@ async def lifespan(app: FastAPI): await init_db() logger.info("Database initialized") + # Start metrics server + metrics_collector.start_metrics_server(8080) + logger.info("Metrics server started on port 8080") + yield # Shutdown @@ -44,6 +76,24 @@ app = FastAPI( lifespan=lifespan ) +# Instrument FastAPI with OpenTelemetry +FastAPIInstrumentor.instrument_app(app) + +# Instrument httpx for outgoing requests +HTTPXClientInstrumentor().instrument() + +# Instrument Redis +RedisInstrumentor().instrument() + +# Instrument SQLAlchemy +SQLAlchemyInstrumentor().instrument() + +# Initialize metrics collector +metrics_collector = MetricsCollector("ai-insights") + +# Add metrics middleware to track HTTP requests +add_metrics_middleware(app, metrics_collector) + # CORS middleware app.add_middleware( CORSMiddleware, @@ -81,6 +131,15 @@ async def health_check(): } +@app.get("/metrics") +async def metrics(): + """Prometheus metrics endpoint""" + return Response( + content=metrics_collector.get_metrics(), + media_type="text/plain; version=0.0.4; charset=utf-8" + ) + + if __name__ == "__main__": import uvicorn diff --git a/services/ai_insights/requirements.txt b/services/ai_insights/requirements.txt index 812ffb5c..8dffb182 100644 --- a/services/ai_insights/requirements.txt +++ b/services/ai_insights/requirements.txt @@ -29,6 +29,16 @@ pytz==2023.3 # Logging structlog==23.2.0 +# Monitoring and Observability +prometheus-client==0.23.1 +opentelemetry-api==1.27.0 +opentelemetry-sdk==1.27.0 +opentelemetry-instrumentation-fastapi==0.48b0 +opentelemetry-exporter-otlp-proto-grpc==1.27.0 +opentelemetry-instrumentation-httpx==0.48b0 +opentelemetry-instrumentation-redis==0.48b0 +opentelemetry-instrumentation-sqlalchemy==0.48b0 + # Machine Learning (for confidence scoring and impact estimation) numpy==1.26.2 pandas==2.1.3 diff --git a/services/alert_processor/app/main.py b/services/alert_processor/app/main.py index 7ded61bf..98614f5d 100644 --- a/services/alert_processor/app/main.py +++ b/services/alert_processor/app/main.py @@ -4,25 +4,52 @@ Alert Processor Service v2.0 Main FastAPI application with RabbitMQ consumer lifecycle management. 
""" -from fastapi import FastAPI +from fastapi import FastAPI, Response from fastapi.middleware.cors import CORSMiddleware from contextlib import asynccontextmanager import structlog +import os from app.core.config import settings from app.consumer.event_consumer import EventConsumer from app.api import alerts, sse from shared.redis_utils import initialize_redis, close_redis +from shared.monitoring.logging import setup_logging +from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware -# Configure structured logging -structlog.configure( - processors=[ - structlog.processors.TimeStamper(fmt="iso"), - structlog.processors.add_log_level, - structlog.processors.JSONRenderer() - ] -) +# OpenTelemetry imports +from opentelemetry import trace +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter +from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor +from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor +from opentelemetry.instrumentation.redis import RedisInstrumentor +from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor +from opentelemetry.sdk.resources import Resource +# Configure OpenTelemetry tracing +def setup_tracing(service_name: str = "alert-processor"): + """Initialize OpenTelemetry tracing with OTLP exporter for Jaeger""" + resource = Resource.create({"service.name": service_name}) + + otlp_exporter = OTLPSpanExporter( + endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://otel-collector.monitoring.svc.cluster.local:4317"), + insecure=True + ) + + provider = TracerProvider(resource=resource) + processor = BatchSpanProcessor(otlp_exporter) + provider.add_span_processor(processor) + trace.set_tracer_provider(provider) + + return provider + +# Initialize tracing +tracer_provider = setup_tracing("alert-processor") + +# Setup logging +setup_logging("alert-processor", getattr(settings, 'LOG_LEVEL', 'INFO')) logger = structlog.get_logger() # Global consumer instance @@ -54,6 +81,10 @@ async def lifespan(app: FastAPI): consumer = EventConsumer() await consumer.start() logger.info("alert_processor_started") + + # Start metrics server + metrics_collector.start_metrics_server(8080) + logger.info("Metrics server started on port 8080") except Exception as e: logger.error("alert_processor_startup_failed", error=str(e)) raise @@ -79,6 +110,24 @@ app = FastAPI( debug=settings.DEBUG ) +# Instrument FastAPI with OpenTelemetry +FastAPIInstrumentor.instrument_app(app) + +# Instrument httpx for outgoing requests +HTTPXClientInstrumentor().instrument() + +# Instrument Redis +RedisInstrumentor().instrument() + +# Instrument SQLAlchemy +SQLAlchemyInstrumentor().instrument() + +# Initialize metrics collector +metrics_collector = MetricsCollector("alert-processor") + +# Add metrics middleware to track HTTP requests +add_metrics_middleware(app, metrics_collector) + # CORS middleware app.add_middleware( CORSMiddleware, @@ -126,6 +175,15 @@ async def root(): } +@app.get("/metrics") +async def metrics(): + """Prometheus metrics endpoint""" + return Response( + content=metrics_collector.get_metrics(), + media_type="text/plain; version=0.0.4; charset=utf-8" + ) + + if __name__ == "__main__": import uvicorn diff --git a/services/alert_processor/requirements.txt b/services/alert_processor/requirements.txt index 1df0ecff..586655fb 100644 --- a/services/alert_processor/requirements.txt +++ 
@@ -32,3 +32,13 @@ python-dateutil==2.8.2
 
 # Authentication
 python-jose[cryptography]==3.3.0
+
+# Monitoring and Observability
+prometheus-client==0.23.1
+opentelemetry-api==1.27.0
+opentelemetry-sdk==1.27.0
+opentelemetry-instrumentation-fastapi==0.48b0
+opentelemetry-exporter-otlp-proto-grpc==1.27.0
+opentelemetry-instrumentation-httpx==0.48b0
+opentelemetry-instrumentation-redis==0.48b0
+opentelemetry-instrumentation-sqlalchemy==0.48b0
diff --git a/services/demo_session/app/main.py b/services/demo_session/app/main.py
index 7e9a26ae..61a15b79 100644
--- a/services/demo_session/app/main.py
+++ b/services/demo_session/app/main.py
@@ -3,16 +3,51 @@ Demo Session Service - Main Application
 
 Manages isolated demo sessions with ephemeral data
 """
 
-from fastapi import FastAPI, Request
+from fastapi import FastAPI, Request, Response
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse
 import structlog
 from contextlib import asynccontextmanager
+import os
 
 from app.core import settings, DatabaseManager
 from app.api import demo_sessions, demo_accounts, demo_operations, internal
 from shared.redis_utils import initialize_redis, close_redis
+from shared.monitoring.logging import setup_logging
+from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware
+# OpenTelemetry imports
+from opentelemetry import trace
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import BatchSpanProcessor
+from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
+from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
+from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
+from opentelemetry.instrumentation.redis import RedisInstrumentor
+from opentelemetry.sdk.resources import Resource
+
+# Configure OpenTelemetry tracing
+def setup_tracing(service_name: str = "demo-session"):
+    """Initialize OpenTelemetry tracing with an OTLP exporter"""
+    resource = Resource.create({"service.name": service_name})
+
+    otlp_exporter = OTLPSpanExporter(
+        endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://otel-collector.monitoring.svc.cluster.local:4317"),
+        insecure=True
+    )
+
+    provider = TracerProvider(resource=resource)
+    processor = BatchSpanProcessor(otlp_exporter)
+    provider.add_span_processor(processor)
+    trace.set_tracer_provider(provider)
+
+    return provider
+
+# Initialize tracing
+tracer_provider = setup_tracing("demo-session")
+
+# Setup logging
+setup_logging("demo-session", getattr(settings, 'LOG_LEVEL', 'INFO'))
 
 logger = structlog.get_logger()
 
 # Initialize database
@@ -34,6 +69,10 @@ async def lifespan(app: FastAPI):
         max_connections=50
     )
 
+    # Start metrics server
+    metrics_collector.start_metrics_server(8080)
+    logger.info("Metrics server started on port 8080")
+
     logger.info("Demo Session Service started successfully")
 
     yield
@@ -52,6 +91,21 @@ app = FastAPI(
     lifespan=lifespan
 )
 
+# Instrument FastAPI with OpenTelemetry
+FastAPIInstrumentor.instrument_app(app)
+
+# Instrument httpx for outgoing requests
+HTTPXClientInstrumentor().instrument()
+
+# Instrument Redis
+RedisInstrumentor().instrument()
+
+# Initialize metrics collector
+metrics_collector = MetricsCollector("demo-session")
+
+# Add metrics middleware to track HTTP requests
+add_metrics_middleware(app, metrics_collector)
+
 # CORS middleware
 app.add_middleware(
     CORSMiddleware,
@@ -110,6 +164,15 @@ async def health():
     }
 
 
+@app.get("/metrics")
+async def metrics():
+    """Prometheus metrics endpoint"""
+    return Response(
+        content=metrics_collector.get_metrics(),
+        media_type="text/plain; version=0.0.4; charset=utf-8"
+    )
+
+
 if __name__ == "__main__":
     import uvicorn
     uvicorn.run(
diff --git a/services/demo_session/requirements.txt b/services/demo_session/requirements.txt
index bce4f4d0..ed933570 100644
--- a/services/demo_session/requirements.txt
+++ b/services/demo_session/requirements.txt
@@ -18,3 +18,11 @@ prometheus-client==0.23.1
 aio-pika==9.4.3
 email-validator==2.2.0
 pytz==2024.2
+
+# OpenTelemetry for distributed tracing
+opentelemetry-api==1.27.0
+opentelemetry-sdk==1.27.0
+opentelemetry-instrumentation-fastapi==0.48b0
+opentelemetry-exporter-otlp-proto-grpc==1.27.0
+opentelemetry-instrumentation-httpx==0.48b0
+opentelemetry-instrumentation-redis==0.48b0
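
Note: every service touched above imports MetricsCollector and add_metrics_middleware from shared.monitoring.metrics, which is not part of this diff. The sketch below is only a rough illustration of the interface those call sites assume (MetricsCollector(service_name), start_metrics_server(port), get_metrics(), add_metrics_middleware(app, collector)); the metric names, labels, and implementation details are assumptions, not the actual shared module.

    # Hypothetical sketch of shared/monitoring/metrics.py, inferred from the
    # call sites in this diff. Not the real implementation.
    import time

    from prometheus_client import (
        CollectorRegistry,
        Counter,
        Histogram,
        generate_latest,
        start_http_server,
    )


    class MetricsCollector:
        """Per-service Prometheus registry with basic HTTP request metrics."""

        def __init__(self, service_name: str):
            self.service_name = service_name
            self.registry = CollectorRegistry()
            self.http_requests_total = Counter(
                "http_requests_total",
                "Total HTTP requests",
                ["service", "method", "path", "status"],
                registry=self.registry,
            )
            self.http_request_duration_seconds = Histogram(
                "http_request_duration_seconds",
                "HTTP request latency in seconds",
                ["service", "method", "path"],
                registry=self.registry,
            )

        def start_metrics_server(self, port: int = 8080) -> None:
            # Expose this registry on a standalone HTTP server (scraped on :8080).
            start_http_server(port, registry=self.registry)

        def get_metrics(self) -> bytes:
            # Prometheus text exposition format, returned by the /metrics route.
            return generate_latest(self.registry)


    def add_metrics_middleware(app, collector: MetricsCollector) -> None:
        """Attach an HTTP middleware that records request count and latency."""

        @app.middleware("http")
        async def _record_metrics(request, call_next):
            start = time.perf_counter()
            response = await call_next(request)
            elapsed = time.perf_counter() - start
            # Raw URL path is used as a label here for simplicity; a real
            # implementation would likely use the matched route template to
            # keep label cardinality bounded.
            labels = dict(
                service=collector.service_name,
                method=request.method,
                path=request.url.path,
            )
            collector.http_request_duration_seconds.labels(**labels).observe(elapsed)
            collector.http_requests_total.labels(
                status=str(response.status_code), **labels
            ).inc()
            return response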