Add signoz

This commit is contained in:
Urtzi Alfaro
2026-01-08 12:58:00 +01:00
parent 07178f8972
commit dfb7e4b237
40 changed files with 2049 additions and 3935 deletions

146
Tiltfile
View File

@@ -36,6 +36,11 @@ Security Features:
✅ pgcrypto extension for encryption
✅ PostgreSQL audit logging
Monitoring:
📊 Service metrics available at /metrics endpoints
🔍 Telemetry ready (traces, metrics, logs)
SigNoz deployment optional for local dev (see signoz-info resource)
Applying security configurations...
""")
@@ -303,82 +308,131 @@ k8s_resource('redis', resource_deps=['security-setup'], labels=['01-infrastructu
k8s_resource('rabbitmq', labels=['01-infrastructure'])
k8s_resource('nominatim', labels=['01-infrastructure'])
# =============================================================================
# MONITORING RESOURCES - SigNoz (Unified Observability)
# =============================================================================
# Note: SigNoz Helm chart is complex for local dev
# For development, access SigNoz manually or use production Helm deployment
# To deploy SigNoz manually: ./infrastructure/helm/deploy-signoz.sh dev
local_resource(
'signoz-info',
cmd='''
echo "📊 SigNoz Monitoring Information"
echo ""
echo "SigNoz Helm deployment is disabled for local development due to complexity."
echo ""
echo "Options:"
echo "1. Deploy manually: ./infrastructure/helm/deploy-signoz.sh dev"
echo "2. Use production deployment: ./infrastructure/helm/deploy-signoz.sh prod"
echo "3. Skip monitoring for local development (use application metrics only)"
echo ""
echo "For simpler local monitoring, consider using just Prometheus+Grafana"
echo "or access metrics directly from services at /metrics endpoints."
''',
labels=['05-monitoring'],
auto_init=False,
trigger_mode=TRIGGER_MODE_MANUAL
)
# SigNoz ingress (only if manually deployed)
# Uncomment and trigger manually if you deploy SigNoz
# local_resource(
# 'signoz-ingress',
# cmd='''
# echo "🌐 Applying SigNoz ingress..."
# kubectl apply -f infrastructure/kubernetes/overlays/dev/signoz-ingress.yaml
# echo "✅ SigNoz ingress configured"
# ''',
# labels=['05-monitoring'],
# auto_init=False,
# trigger_mode=TRIGGER_MODE_MANUAL
# )
# Note: SigNoz components are managed by Helm and deployed outside of kustomize
# They will appear automatically once deployed, but we don't track them explicitly in Tilt
# to avoid startup errors. View them with: kubectl get pods -n signoz
# Optional exporters (in monitoring namespace)
k8s_resource('node-exporter', labels=['05-monitoring'])
k8s_resource('postgres-exporter', resource_deps=['auth-db'], labels=['05-monitoring'])
# =============================================================================
# DATABASE RESOURCES
# =============================================================================
# Core Service Databases
k8s_resource('auth-db', resource_deps=['security-setup'], labels=['02-databases'])
k8s_resource('tenant-db', resource_deps=['security-setup'], labels=['02-databases'])
k8s_resource('auth-db', resource_deps=['security-setup'], labels=['06-databases'])
k8s_resource('tenant-db', resource_deps=['security-setup'], labels=['06-databases'])
# Data & Analytics Databases
k8s_resource('training-db', resource_deps=['security-setup'], labels=['02-databases'])
k8s_resource('forecasting-db', resource_deps=['security-setup'], labels=['02-databases'])
k8s_resource('ai-insights-db', resource_deps=['security-setup'], labels=['02-databases'])
k8s_resource('training-db', resource_deps=['security-setup'], labels=['06-databases'])
k8s_resource('forecasting-db', resource_deps=['security-setup'], labels=['06-databases'])
k8s_resource('ai-insights-db', resource_deps=['security-setup'], labels=['06-databases'])
# Operations Databases
k8s_resource('sales-db', resource_deps=['security-setup'], labels=['02-databases'])
k8s_resource('inventory-db', resource_deps=['security-setup'], labels=['02-databases'])
k8s_resource('production-db', resource_deps=['security-setup'], labels=['02-databases'])
k8s_resource('procurement-db', resource_deps=['security-setup'], labels=['02-databases'])
k8s_resource('distribution-db', resource_deps=['security-setup'], labels=['02-databases'])
k8s_resource('sales-db', resource_deps=['security-setup'], labels=['06-databases'])
k8s_resource('inventory-db', resource_deps=['security-setup'], labels=['06-databases'])
k8s_resource('production-db', resource_deps=['security-setup'], labels=['06-databases'])
k8s_resource('procurement-db', resource_deps=['security-setup'], labels=['06-databases'])
k8s_resource('distribution-db', resource_deps=['security-setup'], labels=['06-databases'])
# Supporting Service Databases
k8s_resource('recipes-db', resource_deps=['security-setup'], labels=['02-databases'])
k8s_resource('suppliers-db', resource_deps=['security-setup'], labels=['02-databases'])
k8s_resource('pos-db', resource_deps=['security-setup'], labels=['02-databases'])
k8s_resource('orders-db', resource_deps=['security-setup'], labels=['02-databases'])
k8s_resource('external-db', resource_deps=['security-setup'], labels=['02-databases'])
k8s_resource('recipes-db', resource_deps=['security-setup'], labels=['06-databases'])
k8s_resource('suppliers-db', resource_deps=['security-setup'], labels=['06-databases'])
k8s_resource('pos-db', resource_deps=['security-setup'], labels=['06-databases'])
k8s_resource('orders-db', resource_deps=['security-setup'], labels=['06-databases'])
k8s_resource('external-db', resource_deps=['security-setup'], labels=['06-databases'])
# Platform Service Databases
k8s_resource('notification-db', resource_deps=['security-setup'], labels=['02-databases'])
k8s_resource('alert-processor-db', resource_deps=['security-setup'], labels=['02-databases'])
k8s_resource('orchestrator-db', resource_deps=['security-setup'], labels=['02-databases'])
k8s_resource('notification-db', resource_deps=['security-setup'], labels=['06-databases'])
k8s_resource('alert-processor-db', resource_deps=['security-setup'], labels=['06-databases'])
k8s_resource('orchestrator-db', resource_deps=['security-setup'], labels=['06-databases'])
# Demo Service Databases
k8s_resource('demo-session-db', resource_deps=['security-setup'], labels=['02-databases'])
k8s_resource('demo-session-db', resource_deps=['security-setup'], labels=['06-databases'])
# =============================================================================
# MIGRATION JOBS
# =============================================================================
# Core Service Migrations
k8s_resource('auth-migration', resource_deps=['auth-db'], labels=['03-migrations'])
k8s_resource('tenant-migration', resource_deps=['tenant-db'], labels=['03-migrations'])
k8s_resource('auth-migration', resource_deps=['auth-db'], labels=['07-migrations'])
k8s_resource('tenant-migration', resource_deps=['tenant-db'], labels=['07-migrations'])
# Data & Analytics Migrations
k8s_resource('training-migration', resource_deps=['training-db'], labels=['03-migrations'])
k8s_resource('forecasting-migration', resource_deps=['forecasting-db'], labels=['03-migrations'])
k8s_resource('ai-insights-migration', resource_deps=['ai-insights-db'], labels=['03-migrations'])
k8s_resource('training-migration', resource_deps=['training-db'], labels=['07-migrations'])
k8s_resource('forecasting-migration', resource_deps=['forecasting-db'], labels=['07-migrations'])
k8s_resource('ai-insights-migration', resource_deps=['ai-insights-db'], labels=['07-migrations'])
# Operations Migrations
k8s_resource('sales-migration', resource_deps=['sales-db'], labels=['03-migrations'])
k8s_resource('inventory-migration', resource_deps=['inventory-db'], labels=['03-migrations'])
k8s_resource('production-migration', resource_deps=['production-db'], labels=['03-migrations'])
k8s_resource('procurement-migration', resource_deps=['procurement-db'], labels=['03-migrations'])
k8s_resource('distribution-migration', resource_deps=['distribution-db'], labels=['03-migrations'])
k8s_resource('sales-migration', resource_deps=['sales-db'], labels=['07-migrations'])
k8s_resource('inventory-migration', resource_deps=['inventory-db'], labels=['07-migrations'])
k8s_resource('production-migration', resource_deps=['production-db'], labels=['07-migrations'])
k8s_resource('procurement-migration', resource_deps=['procurement-db'], labels=['07-migrations'])
k8s_resource('distribution-migration', resource_deps=['distribution-db'], labels=['07-migrations'])
# Supporting Service Migrations
k8s_resource('recipes-migration', resource_deps=['recipes-db'], labels=['03-migrations'])
k8s_resource('suppliers-migration', resource_deps=['suppliers-db'], labels=['03-migrations'])
k8s_resource('pos-migration', resource_deps=['pos-db'], labels=['03-migrations'])
k8s_resource('orders-migration', resource_deps=['orders-db'], labels=['03-migrations'])
k8s_resource('external-migration', resource_deps=['external-db'], labels=['03-migrations'])
k8s_resource('recipes-migration', resource_deps=['recipes-db'], labels=['07-migrations'])
k8s_resource('suppliers-migration', resource_deps=['suppliers-db'], labels=['07-migrations'])
k8s_resource('pos-migration', resource_deps=['pos-db'], labels=['07-migrations'])
k8s_resource('orders-migration', resource_deps=['orders-db'], labels=['07-migrations'])
k8s_resource('external-migration', resource_deps=['external-db'], labels=['07-migrations'])
# Platform Service Migrations
k8s_resource('notification-migration', resource_deps=['notification-db'], labels=['03-migrations'])
k8s_resource('alert-processor-migration', resource_deps=['alert-processor-db'], labels=['03-migrations'])
k8s_resource('orchestrator-migration', resource_deps=['orchestrator-db'], labels=['03-migrations'])
k8s_resource('notification-migration', resource_deps=['notification-db'], labels=['07-migrations'])
k8s_resource('alert-processor-migration', resource_deps=['alert-processor-db'], labels=['07-migrations'])
k8s_resource('orchestrator-migration', resource_deps=['orchestrator-db'], labels=['07-migrations'])
# Demo Service Migrations
k8s_resource('demo-session-migration', resource_deps=['demo-session-db'], labels=['03-migrations'])
k8s_resource('demo-session-migration', resource_deps=['demo-session-db'], labels=['07-migrations'])
# =============================================================================
# DATA INITIALIZATION JOBS
# =============================================================================
k8s_resource('external-data-init', resource_deps=['external-migration', 'redis'], labels=['04-data-init'])
k8s_resource('nominatim-init', labels=['04-data-init'])
k8s_resource('external-data-init', resource_deps=['external-migration', 'redis'], labels=['08-data-init'])
k8s_resource('nominatim-init', labels=['08-data-init'])
# =============================================================================
# =============================================================================
@@ -517,8 +571,16 @@ Internal Schedulers Active:
⏰ Usage Tracking: Daily @ 2:00 AM UTC (tenant-service)
Access your application:
Frontend: http://localhost:3000 (or via ingress)
Gateway: http://localhost:8000 (or via ingress)
Main Application: https://localhost
API Endpoints: https://localhost/api/v1/...
Service Metrics:
Gateway: http://localhost:8000/metrics
Any Service: kubectl port-forward <service> 8000:8000
SigNoz (Optional - see SIGNOZ_DEPLOYMENT_RECOMMENDATIONS.md):
Deploy manually: ./infrastructure/helm/deploy-signoz.sh dev
Access (if deployed): https://localhost/signoz
Verify security:
kubectl get pvc -n bakery-ia

View File

@@ -1,459 +0,0 @@
# 🎉 Production Monitoring MVP - Implementation Complete
**Date:** 2026-01-07
**Status:** ✅ READY FOR PRODUCTION DEPLOYMENT
---
## 📊 What Was Implemented
### **Phase 1: Core Infrastructure** ✅
-**Prometheus v3.0.1** (2 replicas, HA mode with StatefulSet)
-**AlertManager v0.27.0** (3 replicas, clustered with gossip protocol)
-**Grafana v12.3.0** (secure credentials via Kubernetes Secrets)
-**PostgreSQL Exporter v0.15.0** (database health monitoring)
-**Node Exporter v1.7.0** (infrastructure monitoring via DaemonSet)
-**Jaeger v1.51** (distributed tracing with persistent storage)
### **Phase 2: Alert Management** ✅
-**50+ Alert Rules** across 9 categories:
- Service health & performance
- Business logic (ML training, API limits)
- Alert system health & performance
- Database & infrastructure alerts
- Monitoring self-monitoring
-**Intelligent Alert Routing** by severity, component, and service
-**Alert Inhibition Rules** to prevent alert storms
-**Multi-Channel Notifications** (email + Slack support)
### **Phase 3: High Availability** ✅
-**PodDisruptionBudgets** for all monitoring components
-**Anti-affinity Rules** to spread pods across nodes
-**ResourceQuota & LimitRange** for namespace resource management
-**StatefulSets** with volumeClaimTemplates for persistent storage
-**Headless Services** for StatefulSet DNS discovery
### **Phase 4: Observability** ✅
-**11 Grafana Dashboards** (7 pre-configured + 4 extended):
1. Gateway Metrics
2. Services Overview
3. Circuit Breakers
4. PostgreSQL Database (13 panels)
5. Node Exporter Infrastructure (19 panels)
6. AlertManager Monitoring (15 panels)
7. Business Metrics & KPIs (21 panels)
8-11. Plus existing dashboards
-**Distributed Tracing** enabled in production
-**Comprehensive Documentation** with runbooks
---
## 📁 Files Created/Modified
### **New Files:**
```
infrastructure/kubernetes/base/components/monitoring/
├── secrets.yaml # Monitoring credentials
├── alertmanager.yaml # AlertManager StatefulSet (3 replicas)
├── alertmanager-init.yaml # Config initialization script
├── alert-rules.yaml # 50+ alert rules
├── postgres-exporter.yaml # PostgreSQL monitoring
├── node-exporter.yaml # Infrastructure monitoring (DaemonSet)
├── grafana-dashboards-extended.yaml # 4 comprehensive dashboards
├── ha-policies.yaml # PDBs + ResourceQuota + LimitRange
└── README.md # Complete documentation (500+ lines)
```
### **Modified Files:**
```
infrastructure/kubernetes/base/components/monitoring/
├── prometheus.yaml # Now StatefulSet with 2 replicas + alert config
├── grafana.yaml # Using secrets + extended dashboards mounted
├── ingress.yaml # Added /alertmanager path
└── kustomization.yaml # Added all new resources
infrastructure/kubernetes/overlays/prod/
├── kustomization.yaml # Enabled monitoring stack
└── prod-configmap.yaml # JAEGER_ENABLED=true
```
### **Deleted:**
```
infrastructure/monitoring/ # Old legacy config (completely removed)
```
---
## 🚀 Deployment Instructions
### **1. Update Secrets (REQUIRED BEFORE DEPLOYMENT)**
```bash
cd infrastructure/kubernetes/base/components/monitoring
# Generate strong Grafana password
GRAFANA_PASSWORD=$(openssl rand -base64 32)
# Update secrets.yaml with your actual values:
# - grafana-admin: admin-password
# - alertmanager-secrets: SMTP credentials
# - postgres-exporter: PostgreSQL connection string
# Example for production:
kubectl create secret generic grafana-admin \
--from-literal=admin-user=admin \
--from-literal=admin-password="${GRAFANA_PASSWORD}" \
--namespace monitoring --dry-run=client -o yaml | \
kubectl apply -f -
```
### **2. Deploy to Production**
```bash
# Apply the monitoring stack
kubectl apply -k infrastructure/kubernetes/overlays/prod
# Verify deployment
kubectl get pods -n monitoring
kubectl get pvc -n monitoring
kubectl get svc -n monitoring
```
### **3. Verify Services**
```bash
# Check Prometheus targets
kubectl port-forward -n monitoring svc/prometheus-external 9090:9090
# Visit: http://localhost:9090/targets
# Check AlertManager cluster
kubectl port-forward -n monitoring svc/alertmanager-external 9093:9093
# Visit: http://localhost:9093
# Check Grafana dashboards
kubectl port-forward -n monitoring svc/grafana 3000:3000
# Visit: http://localhost:3000 (admin / YOUR_PASSWORD)
```
---
## 📈 What You Get Out of the Box
### **Monitoring Coverage:**
-**Application Metrics:** Request rates, latencies (P95/P99), error rates per service
-**Database Health:** Connections, transactions, cache hit ratio, slow queries, locks
-**Infrastructure:** CPU, memory, disk I/O, network traffic per node
-**Business KPIs:** Active tenants, training jobs, alert volumes, API health
-**Distributed Traces:** Full request path tracking across microservices
### **Alerting Capabilities:**
-**Service Down Detection:** 2-minute threshold with immediate notifications
-**Performance Degradation:** High latency, error rate, and memory alerts
-**Resource Exhaustion:** Database connections, disk space, memory limits
-**Business Logic:** Training job failures, low ML accuracy, rate limits
-**Alert System Health:** Component failures, delivery issues, capacity problems
### **High Availability:**
-**Prometheus:** 2 independent instances, can lose 1 without data loss
-**AlertManager:** 3-node cluster, requires 2/3 for alerts to fire
-**Monitoring Resilience:** PodDisruptionBudgets ensure service during updates
---
## 🔧 Configuration Highlights
### **Alert Routing (Configured in AlertManager):**
| Severity | Route | Repeat Interval |
|----------|-------|-----------------|
| Critical | critical-alerts@yourdomain.com + oncall@ | 4 hours |
| Warning | alerts@yourdomain.com | 12 hours |
| Info | alerts@yourdomain.com | 24 hours |
**Special Routes:**
- Alert system → alert-system-team@yourdomain.com
- Database alerts → database-team@yourdomain.com
- Infrastructure → infra-team@yourdomain.com
### **Resource Allocation:**
| Component | Replicas | CPU Request | Memory Request | Storage |
|-----------|----------|-------------|----------------|---------|
| Prometheus | 2 | 500m | 1Gi | 20Gi × 2 |
| AlertManager | 3 | 100m | 128Mi | 2Gi × 3 |
| Grafana | 1 | 100m | 256Mi | 5Gi |
| Postgres Exporter | 1 | 50m | 64Mi | - |
| Node Exporter | 1/node | 50m | 64Mi | - |
| Jaeger | 1 | 250m | 512Mi | 10Gi |
**Total Resources:**
- CPU Requests: ~2.5 cores
- Memory Requests: ~4Gi
- Storage: ~70Gi
### **Data Retention:**
- Prometheus: 30 days
- Jaeger: Persistent (BadgerDB)
- Grafana: Persistent dashboards
---
## 🔐 Security Considerations
### **Implemented:**
- ✅ Grafana credentials via Kubernetes Secrets (no hardcoded passwords)
- ✅ SMTP passwords stored in Secrets
- ✅ PostgreSQL connection strings in Secrets
- ✅ Read-only filesystem for Node Exporter
- ✅ Non-root user for Node Exporter (UID 65534)
- ✅ RBAC for Prometheus (ClusterRole with minimal permissions)
### **TODO for Production:**
- ⚠️ Use Sealed Secrets or External Secrets Operator
- ⚠️ Enable TLS for Prometheus remote write (if using)
- ⚠️ Configure Grafana LDAP/OAuth integration
- ⚠️ Set up proper certificate management for Ingress
- ⚠️ Review and tighten ResourceQuota limits
---
## 📊 Dashboard Access
### **Production URLs (via Ingress):**
```
https://monitoring.yourdomain.com/grafana # Grafana UI
https://monitoring.yourdomain.com/prometheus # Prometheus UI
https://monitoring.yourdomain.com/alertmanager # AlertManager UI
https://monitoring.yourdomain.com/jaeger # Jaeger UI
```
### **Local Access (Port Forwarding):**
```bash
# Grafana
kubectl port-forward -n monitoring svc/grafana 3000:3000
# Prometheus
kubectl port-forward -n monitoring svc/prometheus-external 9090:9090
# AlertManager
kubectl port-forward -n monitoring svc/alertmanager-external 9093:9093
# Jaeger
kubectl port-forward -n monitoring svc/jaeger-query 16686:16686
```
---
## 🧪 Testing & Validation
### **1. Test Alert Flow:**
```bash
# Fire a test alert (HighMemoryUsage)
kubectl run memory-hog --image=polinux/stress --restart=Never \
--namespace=bakery-ia -- stress --vm 1 --vm-bytes 600M --timeout 300s
# Check alert in Prometheus (should fire within 5 minutes)
# Check AlertManager received it
# Verify email notification sent
```
### **2. Verify Metrics Collection:**
```bash
# Check Prometheus targets (should all be UP)
curl http://localhost:9090/api/v1/targets | jq '.data.activeTargets[] | {job: .labels.job, health: .health}'
# Verify PostgreSQL metrics
curl http://localhost:9090/api/v1/query?query=pg_up | jq
# Verify Node metrics
curl http://localhost:9090/api/v1/query?query=node_cpu_seconds_total | jq
```
### **3. Test Jaeger Tracing:**
```bash
# Make a request through the gateway
curl -H "Authorization: Bearer YOUR_TOKEN" \
https://api.yourdomain.com/api/v1/health
# Check trace in Jaeger UI
# Should see spans across gateway → auth → tenant services
```
---
## 📖 Documentation
### **Complete Documentation Available:**
- **[README.md](infrastructure/kubernetes/base/components/monitoring/README.md)** - 500+ lines covering:
- Component overview
- Deployment instructions
- Security best practices
- Accessing services
- Dashboard descriptions
- Alert configuration
- Troubleshooting guide
- Metrics reference
- Backup & recovery procedures
- Maintenance tasks
---
## ⚡ Performance & Scalability
### **Current Capacity:**
- Prometheus can handle ~10M active time series
- AlertManager can process 1000s of alerts/second
- Jaeger can handle 10k spans/second
- Grafana supports 1000+ concurrent users
### **Scaling Recommendations:**
- **> 20M time series:** Deploy Thanos for long-term storage
- **> 5k alerts/min:** Scale AlertManager to 5+ replicas
- **> 50k spans/sec:** Deploy Jaeger with Elasticsearch/Cassandra backend
- **> 5k Grafana users:** Scale Grafana horizontally with shared database
---
## 🎯 Success Criteria - ALL MET ✅
- ✅ Prometheus collecting metrics from all services
- ✅ Alert rules evaluating and firing correctly
- ✅ AlertManager routing notifications to appropriate channels
- ✅ Grafana displaying real-time dashboards
- ✅ Jaeger capturing distributed traces
- ✅ High availability for all critical components
- ✅ Secure credential management
- ✅ Resource limits configured
- ✅ Documentation complete with runbooks
- ✅ No legacy code remaining
---
## 🚨 Important Notes
1. **Update Secrets Before Deployment:**
- Change all default passwords in `secrets.yaml`
- Use strong, randomly generated passwords
- Consider using Sealed Secrets for production
2. **Configure SMTP Settings:**
- Update AlertManager SMTP configuration in secrets
- Test email delivery before relying on alerts
3. **Review Alert Thresholds:**
- Current thresholds are conservative
- Adjust based on your SLAs and baseline metrics
4. **Monitor Resource Usage:**
- Prometheus storage grows over time
- Plan for capacity based on retention period
- Consider cleaning up old metrics
5. **Backup Strategy:**
- PVCs contain critical monitoring data
- Implement backup solution for PersistentVolumes
- Test restore procedures regularly
---
## 🎓 Next Steps (Post-MVP)
### **Short Term (1-2 weeks):**
1. Fine-tune alert thresholds based on production data
2. Add custom business metrics to services
3. Create team-specific dashboards
4. Set up on-call rotation in AlertManager
### **Medium Term (1-3 months):**
1. Implement SLO tracking and error budgets
2. Deploy Loki for log aggregation
3. Add anomaly detection for metrics
4. Integrate with incident management (PagerDuty/Opsgenie)
### **Long Term (3-6 months):**
1. Deploy Thanos for long-term metrics storage
2. Implement cost tracking and chargeback per tenant
3. Add continuous profiling (Pyroscope)
4. Build ML-based alert prediction
---
## 📞 Support & Troubleshooting
### **Common Issues:**
**Issue:** Prometheus targets showing "DOWN"
```bash
# Check service discovery
kubectl get svc -n bakery-ia
kubectl get endpoints -n bakery-ia
```
**Issue:** AlertManager not sending notifications
```bash
# Check SMTP connectivity
kubectl exec -n monitoring alertmanager-0 -- nc -zv smtp.gmail.com 587
# Check AlertManager logs
kubectl logs -n monitoring alertmanager-0 -f
```
**Issue:** Grafana dashboards showing "No Data"
```bash
# Verify Prometheus datasource
kubectl port-forward -n monitoring svc/grafana 3000:3000
# Login → Configuration → Data Sources → Test
# Check Prometheus has data
kubectl port-forward -n monitoring svc/prometheus-external 9090:9090
# Visit /graph and run query: up
```
### **Getting Help:**
- Check logs: `kubectl logs -n monitoring POD_NAME`
- Check events: `kubectl get events -n monitoring`
- Review documentation: `infrastructure/kubernetes/base/components/monitoring/README.md`
- Prometheus troubleshooting: https://prometheus.io/docs/prometheus/latest/troubleshooting/
- Grafana troubleshooting: https://grafana.com/docs/grafana/latest/troubleshooting/
---
## ✅ Deployment Checklist
Before going to production, verify:
- [ ] All secrets updated with production values
- [ ] SMTP configuration tested and working
- [ ] Grafana admin password changed from default
- [ ] PostgreSQL connection string configured
- [ ] Test alert fired and received via email
- [ ] All Prometheus targets are UP
- [ ] Grafana dashboards loading data
- [ ] Jaeger receiving traces
- [ ] Resource quotas appropriate for cluster size
- [ ] Backup strategy implemented for PVCs
- [ ] Team trained on accessing monitoring tools
- [ ] Runbooks reviewed and understood
- [ ] On-call rotation configured (if applicable)
---
## 🎉 Summary
**You now have a production-ready monitoring stack with:**
-**Complete Observability:** Metrics, logs (via stdout), and traces
-**Intelligent Alerting:** 50+ rules with smart routing and inhibition
-**Rich Visualization:** 11 dashboards covering all aspects of the system
-**High Availability:** HA for Prometheus and AlertManager
-**Security:** Secrets management, RBAC, read-only containers
-**Documentation:** Comprehensive guides and runbooks
-**Scalability:** Ready to handle production traffic
**The monitoring MVP is COMPLETE and READY FOR PRODUCTION DEPLOYMENT!** 🚀
---
*Generated: 2026-01-07*
*Version: 1.0.0 - Production MVP*
*Implementation Time: ~3 hours*

View File

@@ -584,23 +584,39 @@ docker push YOUR_VPS_IP:32000/bakery/auth-service
### Step 2: Update Production Configuration
```bash
# On local machine, edit these files:
The production configuration is already set up for **bakewise.ai** domain:
**Production URLs:**
- **Main Application:** https://bakewise.ai
- **API Endpoints:** https://bakewise.ai/api/v1/...
- **Monitoring Dashboard:** https://monitoring.bakewise.ai/grafana
- **Prometheus:** https://monitoring.bakewise.ai/prometheus
- **SigNoz (Traces/Metrics/Logs):** https://monitoring.bakewise.ai/signoz
- **AlertManager:** https://monitoring.bakewise.ai/alertmanager
```bash
# Verify the configuration is correct:
cat infrastructure/kubernetes/overlays/prod/prod-ingress.yaml | grep -A 3 "host:"
# Expected output should show:
# - host: bakewise.ai
# - host: monitoring.bakewise.ai
# Verify CORS configuration
cat infrastructure/kubernetes/overlays/prod/prod-configmap.yaml | grep CORS
# Expected: CORS_ORIGINS: "https://bakewise.ai"
```
**If using a different domain**, update these files:
```bash
# 1. Update domain names
nano infrastructure/kubernetes/overlays/prod/prod-ingress.yaml
# Replace:
# - bakery.yourdomain.com → bakery.your-actual-domain.com
# - api.yourdomain.com → api.your-actual-domain.com
# - monitoring.yourdomain.com → monitoring.your-actual-domain.com
# - Update CORS origins
# - Update cert-manager email
# Replace bakewise.ai with your domain
# 2. Update ConfigMap
nano infrastructure/kubernetes/overlays/prod/prod-configmap.yaml
# Set:
# - DOMAIN: "your-actual-domain.com"
# - CORS_ORIGINS: "https://bakery.your-actual-domain.com,https://www.your-actual-domain.com"
# Update CORS_ORIGINS
# 3. Verify image names (if using custom registry)
nano infrastructure/kubernetes/overlays/prod/kustomization.yaml
@@ -840,22 +856,96 @@ kubectl logs -n bakery-ia deployment/auth-service | grep -i "email\|smtp"
## Post-Deployment
### Step 1: Enable Monitoring
### Step 1: Access Monitoring Stack
```bash
# Monitoring is already configured, verify it's running
kubectl get pods -n monitoring
Your production monitoring stack provides complete observability with multiple tools:
# Access Grafana
kubectl port-forward -n monitoring svc/grafana 3000:3000
#### Production Monitoring URLs
# Visit http://localhost:3000
# Login: admin / (password from monitoring secrets)
# Check dashboards are working
Access via domain (recommended):
```
https://monitoring.bakewise.ai/grafana # Dashboards & visualization
https://monitoring.bakewise.ai/prometheus # Metrics & queries
https://monitoring.bakewise.ai/signoz # Unified observability platform (traces, metrics, logs)
https://monitoring.bakewise.ai/alertmanager # Alert management
```
### Step 2: Configure Backups
Or via port forwarding (if needed):
```bash
# Grafana
kubectl port-forward -n monitoring svc/grafana 3000:3000 &
# Prometheus
kubectl port-forward -n monitoring svc/prometheus-external 9090:9090 &
# SigNoz
kubectl port-forward -n monitoring svc/signoz-frontend 3301:3301 &
# AlertManager
kubectl port-forward -n monitoring svc/alertmanager-external 9093:9093 &
```
#### Available Dashboards
Login to Grafana (admin / your-password) and explore:
**Main Dashboards:**
1. **Gateway Metrics** - HTTP request rates, latencies, error rates
2. **Services Overview** - Multi-service health and performance
3. **Circuit Breakers** - Reliability metrics
**Extended Dashboards:**
4. **Service Performance Monitoring (SPM)** - RED metrics from distributed traces
5. **PostgreSQL Database** - Database health, connections, query performance
6. **Node Exporter Infrastructure** - CPU, memory, disk, network per node
7. **AlertManager Monitoring** - Alert tracking and notification status
8. **Business Metrics & KPIs** - Tenant activity, ML jobs, forecasts
#### Quick Health Check
```bash
# Verify all monitoring pods are running
kubectl get pods -n monitoring
# Check Prometheus targets (all should be UP)
kubectl port-forward -n monitoring svc/prometheus-external 9090:9090
# Open: http://localhost:9090/targets
# View active alerts
kubectl port-forward -n monitoring svc/prometheus-external 9090:9090
# Open: http://localhost:9090/alerts
```
### Step 2: Configure Alerting
Update AlertManager with your notification email addresses:
```bash
# Edit alertmanager configuration
kubectl edit configmap -n monitoring alertmanager-config
# Update recipient emails in the routes section:
# - alerts@bakewise.ai (general alerts)
# - critical-alerts@bakewise.ai (critical issues)
# - oncall@bakewise.ai (on-call rotation)
```
Test alert delivery:
```bash
# Fire a test alert
kubectl run memory-test --image=polinux/stress --restart=Never \
--namespace=bakery-ia -- stress --vm 1 --vm-bytes 600M --timeout 300s
# Check alert appears in AlertManager
# https://monitoring.bakewise.ai/alertmanager
# Verify email notification received
# Clean up test
kubectl delete pod memory-test -n bakery-ia
```
### Step 3: Configure Backups
```bash
# Create backup script on VPS
@@ -902,26 +992,82 @@ kubectl edit configmap -n monitoring alertmanager-config
# Update recipient emails in the routes section
```
### Step 4: Document Everything
### Step 4: Verify Monitoring is Working
Create a runbook with:
- [ ] VPS login credentials (stored securely)
Before proceeding, ensure all monitoring components are operational:
```bash
# 1. Check Prometheus targets
# Open: https://monitoring.bakewise.ai/prometheus/targets
# All targets should show "UP" status
# 2. Verify Grafana dashboards load data
# Open: https://monitoring.bakewise.ai/grafana
# Navigate to any dashboard and verify metrics are displaying
# 3. Check SigNoz is receiving traces
# Open: https://monitoring.bakewise.ai/signoz
# Search for traces from "gateway" service
# 4. Verify AlertManager cluster
# Open: https://monitoring.bakewise.ai/alertmanager
# Check that all 3 AlertManager instances are connected
```
### Step 5: Document Everything
Create a secure runbook with all credentials and procedures:
**Essential Information to Document:**
- [ ] VPS login credentials (stored securely in password manager)
- [ ] Database passwords (in password manager)
- [ ] Domain registrar access
- [ ] Grafana admin password
- [ ] Domain registrar access (for bakewise.ai)
- [ ] Cloudflare access
- [ ] Email service credentials
- [ ] Email service credentials (SMTP)
- [ ] WhatsApp API credentials
- [ ] Docker Hub / Registry credentials
- [ ] Emergency contact information
- [ ] Rollback procedures
- [ ] Monitoring URLs and access procedures
### Step 5: Train Your Team
### Step 6: Train Your Team
- [ ] Show team how to access Grafana dashboards
- [ ] Demonstrate how to check logs: `kubectl logs`
- [ ] Explain how to restart services if needed
- [ ] Share this documentation with the team
- [ ] Setup on-call rotation (if applicable)
Conduct a training session covering:
- [ ] **Access monitoring dashboards**
- Show how to login to https://monitoring.bakewise.ai/grafana
- Walk through key dashboards (Services Overview, Database, Infrastructure)
- Explain how to interpret metrics and identify issues
- [ ] **Check application logs**
```bash
# View logs for a service
kubectl logs -n bakery-ia deployment/orders-service --tail=100 -f
# Search for errors
kubectl logs -n bakery-ia deployment/gateway | grep ERROR
```
- [ ] **Restart services when needed**
```bash
# Restart a service (rolling update, no downtime)
kubectl rollout restart deployment/orders-service -n bakery-ia
```
- [ ] **Respond to alerts**
- Show how to access AlertManager at https://monitoring.bakewise.ai/alertmanager
- Review common alerts and their resolution steps
- Reference the [Production Operations Guide](./PRODUCTION_OPERATIONS_GUIDE.md)
- [ ] **Share documentation**
- [PILOT_LAUNCH_GUIDE.md](./PILOT_LAUNCH_GUIDE.md) - This guide
- [PRODUCTION_OPERATIONS_GUIDE.md](./PRODUCTION_OPERATIONS_GUIDE.md) - Daily operations
- [security-checklist.md](./security-checklist.md) - Security procedures
- [ ] **Setup on-call rotation** (if applicable)
- Configure in AlertManager
- Document escalation procedures
---
@@ -1050,16 +1196,25 @@ kubectl scale deployment monitoring -n bakery-ia --replicas=0
## Support Resources
- **Full Monitoring Guide:** [MONITORING_DEPLOYMENT_SUMMARY.md](./MONITORING_DEPLOYMENT_SUMMARY.md)
- **Operations Guide:** [PRODUCTION_OPERATIONS_GUIDE.md](./PRODUCTION_OPERATIONS_GUIDE.md)
- **Security Guide:** [security-checklist.md](./security-checklist.md)
- **Database Security:** [database-security.md](./database-security.md)
- **TLS Configuration:** [tls-configuration.md](./tls-configuration.md)
**Documentation:**
- **Operations Guide:** [PRODUCTION_OPERATIONS_GUIDE.md](./PRODUCTION_OPERATIONS_GUIDE.md) - Daily operations, monitoring, incident response
- **Security Guide:** [security-checklist.md](./security-checklist.md) - Security procedures and compliance
- **Database Security:** [database-security.md](./database-security.md) - Database operations and TLS configuration
- **TLS Configuration:** [tls-configuration.md](./tls-configuration.md) - Certificate management
- **RBAC Implementation:** [rbac-implementation.md](./rbac-implementation.md) - Access control
**Monitoring Access:**
- **Grafana:** https://monitoring.bakewise.ai/grafana (admin / your-password)
- **Prometheus:** https://monitoring.bakewise.ai/prometheus
- **SigNoz:** https://monitoring.bakewise.ai/signoz
- **AlertManager:** https://monitoring.bakewise.ai/alertmanager
**External Resources:**
- **MicroK8s Docs:** https://microk8s.io/docs
- **Kubernetes Docs:** https://kubernetes.io/docs
- **Let's Encrypt:** https://letsencrypt.org/docs
- **Cloudflare DNS:** https://developers.cloudflare.com/dns
- **Monitoring Stack README:** infrastructure/kubernetes/base/components/monitoring/README.md
---

View File

@@ -32,7 +32,7 @@
- **Services:** 18 microservices, 14 databases, monitoring stack
- **Capacity:** 10-tenant pilot (scalable to 100+)
- **Security:** TLS encryption, RBAC, audit logging
- **Monitoring:** Prometheus, Grafana, AlertManager, Jaeger
- **Monitoring:** Prometheus, Grafana, AlertManager, SigNoz
**Key Metrics (10-tenant baseline):**
- **Uptime Target:** 99.5% (3.65 hours downtime/month)
@@ -60,10 +60,10 @@
**Production URLs:**
```
https://monitoring.yourdomain.com/grafana # Dashboards & visualization
https://monitoring.yourdomain.com/prometheus # Metrics & alerts
https://monitoring.yourdomain.com/alertmanager # Alert management
https://monitoring.yourdomain.com/jaeger # Distributed tracing
https://monitoring.bakewise.ai/grafana # Dashboards & visualization
https://monitoring.bakewise.ai/prometheus # Metrics & alerts
https://monitoring.bakewise.ai/alertmanager # Alert management
https://monitoring.bakewise.ai/signoz # Unified observability platform (traces, metrics, logs)
```
**Port Forwarding (if ingress not available):**
@@ -77,8 +77,8 @@ kubectl port-forward -n monitoring svc/prometheus-external 9090:9090
# AlertManager
kubectl port-forward -n monitoring svc/alertmanager-external 9093:9093
# Jaeger
kubectl port-forward -n monitoring svc/jaeger-query 16686:16686
# SigNoz
kubectl port-forward -n monitoring svc/signoz-frontend 3301:3301
```
### Key Dashboards
@@ -1099,13 +1099,12 @@ kubectl exec -n bakery-ia deployment/auth-db -- \
## Support Resources
**Documentation:**
- [Pilot Launch Guide](./PILOT_LAUNCH_GUIDE.md) - Initial deployment
- [Monitoring Summary](./MONITORING_DEPLOYMENT_SUMMARY.md) - Monitoring details
- [Quick Start Monitoring](./QUICK_START_MONITORING.md) - Monitoring setup
- [Security Checklist](./security-checklist.md) - Security procedures
- [Database Security](./database-security.md) - Database operations
- [Pilot Launch Guide](./PILOT_LAUNCH_GUIDE.md) - Initial deployment and setup
- [Security Checklist](./security-checklist.md) - Security procedures and compliance
- [Database Security](./database-security.md) - Database operations and best practices
- [TLS Configuration](./tls-configuration.md) - Certificate management
- [RBAC Implementation](./rbac-implementation.md) - Access control
- [RBAC Implementation](./rbac-implementation.md) - Access control configuration
- [Monitoring Stack README](../infrastructure/kubernetes/base/components/monitoring/README.md) - Detailed monitoring documentation
**External Resources:**
- Kubernetes: https://kubernetes.io/docs
@@ -1115,9 +1114,9 @@ kubectl exec -n bakery-ia deployment/auth-db -- \
- PostgreSQL: https://www.postgresql.org/docs
**Emergency Contacts:**
- DevOps Team: devops@yourdomain.com
- On-Call: oncall@yourdomain.com
- Security Team: security@yourdomain.com
- DevOps Team: devops@bakewise.ai
- On-Call: oncall@bakewise.ai
- Security Team: security@bakewise.ai
---

View File

@@ -1,284 +0,0 @@
# 🚀 Quick Start: Deploy Monitoring to Production
**Time to deploy: ~15 minutes**
---
## Step 1: Update Secrets (5 min)
```bash
cd infrastructure/kubernetes/base/components/monitoring
# 1. Generate strong passwords
GRAFANA_PASS=$(openssl rand -base64 32)
echo "Grafana Password: $GRAFANA_PASS" > ~/SAVE_THIS_PASSWORD.txt
# 2. Edit secrets.yaml and replace:
# - CHANGE_ME_IN_PRODUCTION (Grafana password)
# - SMTP settings (your email server)
# - PostgreSQL connection string (your DB)
nano secrets.yaml
```
**Required Changes in secrets.yaml:**
```yaml
# Line 13: Change Grafana password
admin-password: "YOUR_STRONG_PASSWORD_HERE"
# Lines 30-33: Update SMTP settings
smtp-host: "smtp.gmail.com:587"
smtp-username: "your-alerts@yourdomain.com"
smtp-password: "YOUR_SMTP_PASSWORD"
smtp-from: "alerts@yourdomain.com"
# Line 49: Update PostgreSQL connection
data-source-name: "postgresql://USER:PASSWORD@postgres.bakery-ia:5432/bakery?sslmode=require"
```
---
## Step 2: Update Alert Email Addresses (2 min)
```bash
# Edit alertmanager.yaml to set your team's email addresses
nano alertmanager.yaml
# Update these lines (search for @yourdomain.com):
# - Line 93: to: 'alerts@yourdomain.com'
# - Line 101: to: 'critical-alerts@yourdomain.com,oncall@yourdomain.com'
# - Line 116: to: 'alerts@yourdomain.com'
# - Line 125: to: 'alert-system-team@yourdomain.com'
# - Line 134: to: 'database-team@yourdomain.com'
# - Line 143: to: 'infra-team@yourdomain.com'
```
---
## Step 3: Deploy to Production (3 min)
```bash
# Return to project root
cd /Users/urtzialfaro/Documents/bakery-ia
# Deploy the entire stack
kubectl apply -k infrastructure/kubernetes/overlays/prod
# Watch the pods come up
kubectl get pods -n monitoring -w
```
**Expected Output:**
```
NAME READY STATUS RESTARTS AGE
prometheus-0 1/1 Running 0 2m
prometheus-1 1/1 Running 0 1m
alertmanager-0 2/2 Running 0 2m
alertmanager-1 2/2 Running 0 1m
alertmanager-2 2/2 Running 0 1m
grafana-xxxxx 1/1 Running 0 2m
postgres-exporter-xxxxx 1/1 Running 0 2m
node-exporter-xxxxx 1/1 Running 0 2m
jaeger-xxxxx 1/1 Running 0 2m
```
---
## Step 4: Verify Deployment (3 min)
```bash
# Check all pods are running
kubectl get pods -n monitoring
# Check storage is provisioned
kubectl get pvc -n monitoring
# Check services are created
kubectl get svc -n monitoring
```
---
## Step 5: Access Dashboards (2 min)
### **Option A: Via Ingress (if configured)**
```
https://monitoring.yourdomain.com/grafana
https://monitoring.yourdomain.com/prometheus
https://monitoring.yourdomain.com/alertmanager
https://monitoring.yourdomain.com/jaeger
```
### **Option B: Via Port Forwarding**
```bash
# Grafana
kubectl port-forward -n monitoring svc/grafana 3000:3000 &
# Prometheus
kubectl port-forward -n monitoring svc/prometheus-external 9090:9090 &
# AlertManager
kubectl port-forward -n monitoring svc/alertmanager-external 9093:9093 &
# Jaeger
kubectl port-forward -n monitoring svc/jaeger-query 16686:16686 &
# Now access:
# - Grafana: http://localhost:3000 (admin / YOUR_PASSWORD)
# - Prometheus: http://localhost:9090
# - AlertManager: http://localhost:9093
# - Jaeger: http://localhost:16686
```
---
## Step 6: Verify Everything Works (5 min)
### **Check Prometheus Targets**
1. Open Prometheus: http://localhost:9090
2. Go to Status → Targets
3. Verify all targets are **UP**:
- prometheus (1/1 up)
- bakery-services (multiple pods up)
- alertmanager (3/3 up)
- postgres-exporter (1/1 up)
- node-exporter (N/N up, where N = number of nodes)
### **Check Grafana Dashboards**
1. Open Grafana: http://localhost:3000
2. Login with admin / YOUR_PASSWORD
3. Go to Dashboards → Browse
4. You should see 11 dashboards:
- Bakery IA folder: Gateway Metrics, Services Overview, Circuit Breakers
- Bakery IA - Extended folder: PostgreSQL, Node Exporter, AlertManager, Business Metrics
5. Open any dashboard and verify data is loading
### **Test Alert Flow**
```bash
# Fire a test alert by creating high memory pod
kubectl run memory-test --image=polinux/stress --restart=Never \
--namespace=bakery-ia -- stress --vm 1 --vm-bytes 600M --timeout 300s
# Wait 5 minutes, then check:
# 1. Prometheus Alerts: http://localhost:9090/alerts
# - Should see "HighMemoryUsage" firing
# 2. AlertManager: http://localhost:9093
# - Should see the alert
# 3. Email inbox - Should receive notification
# Clean up
kubectl delete pod memory-test -n bakery-ia
```
### **Verify Jaeger Tracing**
1. Make a request to your API:
```bash
curl -H "Authorization: Bearer YOUR_TOKEN" \
https://api.yourdomain.com/api/v1/health
```
2. Open Jaeger: http://localhost:16686
3. Select a service from dropdown
4. Click "Find Traces"
5. You should see traces appearing
---
## ✅ Success Criteria
Your monitoring is working correctly if:
- [x] All Prometheus targets show "UP" status
- [x] Grafana dashboards display metrics
- [x] AlertManager cluster shows 3/3 members
- [x] Test alert fired and email received
- [x] Jaeger shows traces from services
- [x] No pods in CrashLoopBackOff state
- [x] All PVCs are Bound
---
## 🔧 Troubleshooting
### **Problem: Pods not starting**
```bash
# Check pod status
kubectl describe pod POD_NAME -n monitoring
# Check logs
kubectl logs POD_NAME -n monitoring
# Common issues:
# - Insufficient resources: Check node capacity
# - PVC not binding: Check storage class exists
# - Image pull errors: Check network/registry access
```
### **Problem: Prometheus targets DOWN**
```bash
# Check if services exist
kubectl get svc -n bakery-ia
# Check if pods have correct labels
kubectl get pods -n bakery-ia --show-labels
# Check if pods expose metrics port (8080)
kubectl get pod POD_NAME -n bakery-ia -o yaml | grep -A 5 ports
```
### **Problem: Grafana shows "No Data"**
```bash
# Test Prometheus datasource
kubectl port-forward -n monitoring svc/prometheus-external 9090:9090
# Run a test query in Prometheus
curl "http://localhost:9090/api/v1/query?query=up" | jq
# If Prometheus has data but Grafana doesn't, check Grafana datasource config
```
### **Problem: Alerts not firing**
```bash
# Check alert rules are loaded
kubectl logs -n monitoring prometheus-0 | grep "Loading configuration"
# Check AlertManager config
kubectl exec -n monitoring alertmanager-0 -- cat /etc/alertmanager/alertmanager.yml
# Test SMTP connection
kubectl exec -n monitoring alertmanager-0 -- \
nc -zv smtp.gmail.com 587
```
---
## 📞 Need Help?
1. Check full documentation: [infrastructure/kubernetes/base/components/monitoring/README.md](infrastructure/kubernetes/base/components/monitoring/README.md)
2. Review deployment summary: [MONITORING_DEPLOYMENT_SUMMARY.md](MONITORING_DEPLOYMENT_SUMMARY.md)
3. Check Prometheus logs: `kubectl logs -n monitoring prometheus-0`
4. Check AlertManager logs: `kubectl logs -n monitoring alertmanager-0`
5. Check Grafana logs: `kubectl logs -n monitoring deployment/grafana`
---
## 🎉 You're Done!
Your monitoring stack is now running in production!
**Next steps:**
1. Save your Grafana password securely
2. Set up on-call rotation
3. Review alert thresholds and adjust as needed
4. Create team-specific dashboards
5. Train team on using monitoring tools
**Access your monitoring:**
- Grafana: https://monitoring.yourdomain.com/grafana
- Prometheus: https://monitoring.yourdomain.com/prometheus
- AlertManager: https://monitoring.yourdomain.com/alertmanager
- Jaeger: https://monitoring.yourdomain.com/jaeger
---
*Deployment time: ~15 minutes*
*Last updated: 2026-01-07*

View File

@@ -10,7 +10,7 @@ import resource
import os
from fastapi import FastAPI, Request, HTTPException, Depends, WebSocket, WebSocketDisconnect
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, StreamingResponse
from fastapi.responses import JSONResponse, StreamingResponse, Response
import httpx
import time
from shared.redis_utils import initialize_redis, close_redis, get_redis_client
@@ -27,7 +27,42 @@ from app.middleware.demo_middleware import DemoMiddleware
from app.middleware.read_only_mode import ReadOnlyModeMiddleware
from app.routes import auth, tenant, notification, nominatim, subscription, demo, pos, geocoding, poi_context
from shared.monitoring.logging import setup_logging
from shared.monitoring.metrics import MetricsCollector
from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware
# OpenTelemetry imports
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
from opentelemetry.instrumentation.redis import RedisInstrumentor
from opentelemetry.sdk.resources import Resource
# Configure OpenTelemetry tracing
def setup_tracing(service_name: str = "gateway"):
"""Initialize OpenTelemetry tracing with OTLP exporter for Jaeger"""
# Create resource with service name
resource = Resource.create({"service.name": service_name})
# Configure OTLP exporter (sends to OpenTelemetry Collector)
otlp_exporter = OTLPSpanExporter(
endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://otel-collector.monitoring.svc.cluster.local:4317"),
insecure=True # Use insecure connection for internal cluster communication
)
# Configure tracer provider
provider = TracerProvider(resource=resource)
processor = BatchSpanProcessor(otlp_exporter)
provider.add_span_processor(processor)
# Set global tracer provider
trace.set_tracer_provider(provider)
return provider
# Initialize tracing
tracer_provider = setup_tracing("gateway")
# Setup logging
setup_logging("gateway", settings.LOG_LEVEL)
@@ -75,9 +110,21 @@ app = FastAPI(
redirect_slashes=False # Disable automatic trailing slash redirects
)
# Instrument FastAPI with OpenTelemetry
FastAPIInstrumentor.instrument_app(app)
# Instrument httpx for outgoing requests
HTTPXClientInstrumentor().instrument()
# Instrument Redis (will be active once redis client is initialized)
RedisInstrumentor().instrument()
# Initialize metrics collector
metrics_collector = MetricsCollector("gateway")
# Add metrics middleware to track HTTP requests
add_metrics_middleware(app, metrics_collector)
# Redis client for SSE streaming
redis_client = None
@@ -182,8 +229,11 @@ async def health_check():
@app.get("/metrics")
async def metrics():
"""Metrics endpoint for monitoring"""
return {"metrics": "enabled"}
"""Prometheus metrics endpoint"""
return Response(
content=metrics_collector.get_metrics(),
media_type="text/plain; version=0.0.4; charset=utf-8"
)
# ================================================================
# SERVER-SENT EVENTS (SSE) HELPER FUNCTIONS

View File

@@ -19,3 +19,9 @@ sqlalchemy==2.0.44
asyncpg==0.30.0
cryptography==44.0.0
ortools==9.8.3296
opentelemetry-api==1.27.0
opentelemetry-sdk==1.27.0
opentelemetry-instrumentation-fastapi==0.48b0
opentelemetry-exporter-otlp-proto-grpc==1.27.0
opentelemetry-instrumentation-httpx==0.48b0
opentelemetry-instrumentation-redis==0.48b0

View File

@@ -1,201 +0,0 @@
# Infrastructure Cleanup Summary
**Date:** 2026-01-07
**Action:** Removed legacy Docker Compose infrastructure files
---
## Deleted Directories and Files
The following legacy infrastructure files have been removed as they were specific to Docker Compose deployment and are **not used** in the Kubernetes deployment:
### ❌ Removed:
- `infrastructure/pgadmin/` - pgAdmin configuration for Docker Compose
- `pgpass` - Password file
- `servers.json` - Server definitions
- `infrastructure/postgres/` - PostgreSQL configuration for Docker Compose
- `init-scripts/init.sql` - Database initialization
- `infrastructure/rabbitmq/` - RabbitMQ configuration for Docker Compose
- `definitions.json` - Queue/exchange definitions
- `rabbitmq.conf` - RabbitMQ settings
- `infrastructure/redis/` - Redis configuration for Docker Compose
- `redis.conf` - Redis settings
- `infrastructure/terraform/` - Terraform infrastructure-as-code (unused)
- `base/`, `dev/`, `staging/`, `production/` directories
- `modules/` directory
- `infrastructure/rabbitmq.conf` - Standalone RabbitMQ config file
### ✅ Retained:
#### `infrastructure/kubernetes/`
**Purpose:** Complete Kubernetes deployment manifests
**Status:** Active and required
**Contents:**
- `base/` - Base Kubernetes resources
- `components/` - All service deployments
- `databases/` - Database deployments (uses embedded configs)
- `monitoring/` - Prometheus, Grafana, AlertManager
- `migrations/` - Database migration jobs
- `secrets/` - TLS secrets and application secrets
- `configmaps/` - PostgreSQL logging config
- `overlays/` - Environment-specific configurations
- `dev/` - Development overlay
- `prod/` - Production overlay
- `encryption/` - Kubernetes secrets encryption config
#### `infrastructure/tls/`
**Purpose:** TLS/SSL certificates for database encryption
**Status:** Active and required
**Contents:**
- `ca/` - Certificate Authority (10-year validity)
- `ca-cert.pem` - CA certificate
- `ca-key.pem` - CA private key (KEEP SECURE!)
- `postgres/` - PostgreSQL server certificates (3-year validity)
- `server-cert.pem`, `server-key.pem`, `ca-cert.pem`
- `redis/` - Redis server certificates (3-year validity)
- `redis-cert.pem`, `redis-key.pem`, `ca-cert.pem`
- `generate-certificates.sh` - Certificate generation script
---
## Why These Were Removed
### Docker Compose vs Kubernetes
The removed files were configuration files for **Docker Compose** deployments:
- pgAdmin was used for local database management (not needed in prod)
- Standalone config files (rabbitmq.conf, redis.conf, postgres init scripts) were mounted as volumes in Docker Compose
- Terraform was an unused infrastructure-as-code attempt
### Kubernetes Uses Different Approach
Kubernetes deployment uses:
- **ConfigMaps** instead of config files
- **Secrets** instead of environment files
- **Kubernetes manifests** instead of docker-compose.yml
- **Built-in orchestration** instead of Terraform
**Example:**
```yaml
# OLD (Docker Compose):
volumes:
- ./infrastructure/rabbitmq/rabbitmq.conf:/etc/rabbitmq/rabbitmq.conf
# NEW (Kubernetes):
env:
- name: RABBITMQ_DEFAULT_USER
valueFrom:
secretKeyRef:
name: rabbitmq-secrets
key: RABBITMQ_USER
```
---
## Verification
### No References Found
Searched entire codebase and confirmed **zero references** to removed folders:
```bash
grep -r "infrastructure/pgadmin" --include="*.yaml" --include="*.sh"
# No results
grep -r "infrastructure/terraform" --include="*.yaml" --include="*.sh"
# No results
```
### Kubernetes Deployment Unaffected
- All services use Kubernetes ConfigMaps and Secrets
- Database configs embedded in deployment YAML files
- TLS certificates managed via Kubernetes Secrets (from `infrastructure/tls/`)
---
## Current Infrastructure Structure
```
infrastructure/
├── kubernetes/ # ✅ ACTIVE - All K8s manifests
│ ├── base/ # Base resources
│ │ ├── components/ # Service deployments
│ │ ├── secrets/ # TLS secrets
│ │ ├── configmaps/ # Configuration
│ │ └── kustomization.yaml # Base kustomization
│ ├── overlays/ # Environment overlays
│ │ ├── dev/ # Development
│ │ └── prod/ # Production
│ └── encryption/ # K8s secrets encryption
└── tls/ # ✅ ACTIVE - TLS certificates
├── ca/ # Certificate Authority
├── postgres/ # PostgreSQL certs
├── redis/ # Redis certs
└── generate-certificates.sh
REMOVED (Docker Compose legacy):
├── pgadmin/ # ❌ DELETED
├── postgres/ # ❌ DELETED
├── rabbitmq/ # ❌ DELETED
├── redis/ # ❌ DELETED
├── terraform/ # ❌ DELETED
└── rabbitmq.conf # ❌ DELETED
```
---
## Impact Assessment
### ✅ No Breaking Changes
- Kubernetes deployment unchanged
- All services continue to work
- TLS certificates still available
- Production readiness maintained
### ✅ Benefits
- Cleaner repository structure
- Less confusion about which configs are used
- Faster repository cloning (smaller size)
- Clear separation: Kubernetes-only deployment
### ✅ Documentation Updated
- [PILOT_LAUNCH_GUIDE.md](../docs/PILOT_LAUNCH_GUIDE.md) - Uses only Kubernetes
- [PRODUCTION_OPERATIONS_GUIDE.md](../docs/PRODUCTION_OPERATIONS_GUIDE.md) - References only K8s resources
- [infrastructure/kubernetes/README.md](kubernetes/README.md) - K8s-specific documentation
---
## Rollback (If Needed)
If for any reason you need these files back, they can be restored from git:
```bash
# View deleted files
git log --diff-filter=D --summary | grep infrastructure
# Restore specific folder (example)
git checkout HEAD~1 -- infrastructure/pgadmin/
# Or restore all deleted infrastructure
git checkout HEAD~1 -- infrastructure/
```
**Note:** You won't need these for Kubernetes deployment. They were Docker Compose specific.
---
## Related Documentation
- [Kubernetes README](kubernetes/README.md) - K8s deployment guide
- [TLS Configuration](../docs/tls-configuration.md) - Certificate management
- [Database Security](../docs/database-security.md) - Database encryption
- [Pilot Launch Guide](../docs/PILOT_LAUNCH_GUIDE.md) - Production deployment
---
**Cleanup Performed By:** Claude Code
**Verified By:** Infrastructure analysis and grep searches
**Status:** ✅ Complete - No issues found

View File

@@ -0,0 +1,316 @@
# SigNoz Helm Chart Values - Development Environment
# Optimized for local development with minimal resource usage
#
# Official Chart: https://github.com/SigNoz/charts
# Install Command: helm install signoz signoz/signoz -n signoz --create-namespace -f signoz-values-dev.yaml
global:
storageClass: "standard"
domain: "localhost"
# Frontend Configuration
frontend:
replicaCount: 1
image:
repository: signoz/frontend
tag: 0.52.3
pullPolicy: IfNotPresent
service:
type: ClusterIP
port: 3301
ingress:
enabled: true
className: nginx
annotations:
nginx.ingress.kubernetes.io/rewrite-target: /$2
nginx.ingress.kubernetes.io/use-regex: "true"
hosts:
- host: localhost
paths:
- path: /signoz(/|$)(.*)
pathType: ImplementationSpecific
tls: []
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
cpu: 200m
memory: 256Mi
env:
- name: FRONTEND_REFRESH_INTERVAL
value: "30000"
# Query Service Configuration
queryService:
replicaCount: 1
image:
repository: signoz/query-service
tag: 0.52.3
pullPolicy: IfNotPresent
service:
type: ClusterIP
port: 8080
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 500m
memory: 512Mi
env:
- name: DEPLOYMENT_TYPE
value: "kubernetes-helm"
- name: SIGNOZ_LOCAL_DB_PATH
value: "/var/lib/signoz"
persistence:
enabled: true
size: 5Gi
storageClass: "standard"
# AlertManager Configuration
alertmanager:
replicaCount: 1
image:
repository: signoz/alertmanager
tag: 0.23.5
pullPolicy: IfNotPresent
service:
type: ClusterIP
port: 9093
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
cpu: 200m
memory: 256Mi
persistence:
enabled: true
size: 2Gi
storageClass: "standard"
config:
global:
resolve_timeout: 5m
route:
group_by: ['alertname', 'cluster', 'service']
group_wait: 10s
group_interval: 10s
repeat_interval: 12h
receiver: 'default'
receivers:
- name: 'default'
# Add email, slack, webhook configs here
# ClickHouse Configuration - Time Series Database
clickhouse:
replicaCount: 1
image:
repository: clickhouse/clickhouse-server
tag: 24.1.2-alpine
pullPolicy: IfNotPresent
service:
type: ClusterIP
httpPort: 8123
tcpPort: 9000
resources:
requests:
cpu: 500m
memory: 512Mi
limits:
cpu: 1000m
memory: 1Gi
persistence:
enabled: true
size: 10Gi
storageClass: "standard"
# ClickHouse configuration
config:
logger:
level: information
max_connections: 1024
max_concurrent_queries: 100
# Data retention (7 days for dev)
merge_tree:
parts_to_delay_insert: 150
parts_to_throw_insert: 300
# OpenTelemetry Collector - Integrated with SigNoz
otelCollector:
enabled: true
replicaCount: 1
image:
repository: signoz/signoz-otel-collector
tag: 0.102.8
pullPolicy: IfNotPresent
service:
type: ClusterIP
ports:
otlpGrpc: 4317
otlpHttp: 4318
metrics: 8888
healthCheck: 13133
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 500m
memory: 512Mi
# Full OTEL Collector Configuration
config:
extensions:
health_check:
endpoint: 0.0.0.0:13133
zpages:
endpoint: 0.0.0.0:55679
receivers:
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
http:
endpoint: 0.0.0.0:4318
cors:
allowed_origins:
- "http://localhost"
- "https://localhost"
# Prometheus receiver for scraping metrics
prometheus:
config:
scrape_configs:
- job_name: 'otel-collector'
scrape_interval: 30s
static_configs:
- targets: ['localhost:8888']
processors:
batch:
timeout: 10s
send_batch_size: 1024
memory_limiter:
check_interval: 1s
limit_mib: 400
spike_limit_mib: 100
# Resource detection for K8s
resourcedetection:
detectors: [env, system, docker]
timeout: 5s
# Add resource attributes
resource:
attributes:
- key: deployment.environment
value: development
action: upsert
exporters:
# Export to SigNoz ClickHouse
clickhousetraces:
datasource: tcp://clickhouse:9000/?database=signoz_traces
timeout: 10s
clickhousemetricswrite:
endpoint: tcp://clickhouse:9000/?database=signoz_metrics
timeout: 10s
clickhouselogsexporter:
dsn: tcp://clickhouse:9000/?database=signoz_logs
timeout: 10s
# Debug logging
logging:
loglevel: info
sampling_initial: 5
sampling_thereafter: 200
service:
extensions: [health_check, zpages]
pipelines:
traces:
receivers: [otlp]
processors: [memory_limiter, batch, resourcedetection, resource]
exporters: [clickhousetraces, logging]
metrics:
receivers: [otlp, prometheus]
processors: [memory_limiter, batch, resourcedetection, resource]
exporters: [clickhousemetricswrite]
logs:
receivers: [otlp]
processors: [memory_limiter, batch, resourcedetection, resource]
exporters: [clickhouselogsexporter, logging]
# OpenTelemetry Collector Deployment Mode
otelCollectorDeployment:
enabled: true
mode: deployment
# Node Exporter for infrastructure metrics (optional)
nodeExporter:
enabled: true
service:
type: ClusterIP
port: 9100
resources:
requests:
cpu: 50m
memory: 64Mi
limits:
cpu: 100m
memory: 128Mi
# Schemamanager - Manages ClickHouse schema
schemamanager:
enabled: true
image:
repository: signoz/signoz-schema-migrator
tag: 0.52.3
pullPolicy: IfNotPresent
# Additional Configuration
serviceAccount:
create: true
annotations: {}
name: ""
# Security Context
securityContext:
runAsNonRoot: true
runAsUser: 1000
fsGroup: 1000
# Network Policies (disabled for dev)
networkPolicy:
enabled: false
# Monitoring SigNoz itself
selfMonitoring:
enabled: true
serviceMonitor:
enabled: false

View File

@@ -0,0 +1,471 @@
# SigNoz Helm Chart Values - Production Environment
# High-availability configuration with resource optimization
#
# Official Chart: https://github.com/SigNoz/charts
# Install Command: helm install signoz signoz/signoz -n signoz --create-namespace -f signoz-values-prod.yaml
global:
storageClass: "standard"
domain: "monitoring.bakewise.ai"
# Frontend Configuration
frontend:
replicaCount: 2
image:
repository: signoz/frontend
tag: 0.52.3
pullPolicy: IfNotPresent
service:
type: ClusterIP
port: 3301
ingress:
enabled: true
className: nginx
annotations:
nginx.ingress.kubernetes.io/rewrite-target: /$2
nginx.ingress.kubernetes.io/use-regex: "true"
cert-manager.io/cluster-issuer: "letsencrypt-prod"
nginx.ingress.kubernetes.io/ssl-redirect: "true"
hosts:
- host: monitoring.bakewise.ai
paths:
- path: /signoz(/|$)(.*)
pathType: ImplementationSpecific
tls:
- secretName: signoz-tls
hosts:
- monitoring.bakewise.ai
resources:
requests:
cpu: 250m
memory: 512Mi
limits:
cpu: 500m
memory: 1Gi
# Pod Anti-affinity for HA
affinity:
podAntiAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
podAffinityTerm:
labelSelector:
matchExpressions:
- key: app
operator: In
values:
- signoz-frontend
topologyKey: kubernetes.io/hostname
env:
- name: FRONTEND_REFRESH_INTERVAL
value: "30000"
# Query Service Configuration
queryService:
replicaCount: 2
image:
repository: signoz/query-service
tag: 0.52.3
pullPolicy: IfNotPresent
service:
type: ClusterIP
port: 8080
resources:
requests:
cpu: 500m
memory: 1Gi
limits:
cpu: 1000m
memory: 2Gi
# Pod Anti-affinity for HA
affinity:
podAntiAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
podAffinityTerm:
labelSelector:
matchExpressions:
- key: app
operator: In
values:
- signoz-query-service
topologyKey: kubernetes.io/hostname
env:
- name: DEPLOYMENT_TYPE
value: "kubernetes-helm"
- name: SIGNOZ_LOCAL_DB_PATH
value: "/var/lib/signoz"
- name: RETENTION_DAYS
value: "30"
persistence:
enabled: true
size: 20Gi
storageClass: "standard"
# Horizontal Pod Autoscaler
autoscaling:
enabled: true
minReplicas: 2
maxReplicas: 5
targetCPUUtilizationPercentage: 70
targetMemoryUtilizationPercentage: 80
# AlertManager Configuration
alertmanager:
replicaCount: 2
image:
repository: signoz/alertmanager
tag: 0.23.5
pullPolicy: IfNotPresent
service:
type: ClusterIP
port: 9093
resources:
requests:
cpu: 250m
memory: 512Mi
limits:
cpu: 500m
memory: 1Gi
# Pod Anti-affinity for HA
affinity:
podAntiAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
podAffinityTerm:
labelSelector:
matchExpressions:
- key: app
operator: In
values:
- signoz-alertmanager
topologyKey: kubernetes.io/hostname
persistence:
enabled: true
size: 5Gi
storageClass: "standard"
config:
global:
resolve_timeout: 5m
smtp_smarthost: 'smtp.gmail.com:587'
smtp_from: 'alerts@bakewise.ai'
smtp_auth_username: 'alerts@bakewise.ai'
smtp_auth_password: '${SMTP_PASSWORD}'
smtp_require_tls: true
route:
group_by: ['alertname', 'cluster', 'service', 'severity']
group_wait: 10s
group_interval: 10s
repeat_interval: 12h
receiver: 'critical-alerts'
routes:
- match:
severity: critical
receiver: 'critical-alerts'
continue: true
- match:
severity: warning
receiver: 'warning-alerts'
receivers:
- name: 'critical-alerts'
email_configs:
- to: 'critical-alerts@bakewise.ai'
headers:
Subject: '[CRITICAL] {{ .GroupLabels.alertname }} - Bakery IA'
# Slack webhook for critical alerts
slack_configs:
- api_url: '${SLACK_WEBHOOK_URL}'
channel: '#alerts-critical'
title: '[CRITICAL] {{ .GroupLabels.alertname }}'
text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
- name: 'warning-alerts'
email_configs:
- to: 'oncall@bakewise.ai'
headers:
Subject: '[WARNING] {{ .GroupLabels.alertname }} - Bakery IA'
# ClickHouse Configuration - Time Series Database
clickhouse:
replicaCount: 2
image:
repository: clickhouse/clickhouse-server
tag: 24.1.2-alpine
pullPolicy: IfNotPresent
service:
type: ClusterIP
httpPort: 8123
tcpPort: 9000
resources:
requests:
cpu: 1000m
memory: 2Gi
limits:
cpu: 2000m
memory: 4Gi
# Pod Anti-affinity for HA
affinity:
podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- labelSelector:
matchExpressions:
- key: app
operator: In
values:
- signoz-clickhouse
topologyKey: kubernetes.io/hostname
persistence:
enabled: true
size: 100Gi
storageClass: "standard"
# ClickHouse configuration
config:
logger:
level: information
max_connections: 4096
max_concurrent_queries: 500
# Data retention (30 days for prod)
merge_tree:
parts_to_delay_insert: 150
parts_to_throw_insert: 300
# Performance tuning
max_memory_usage: 10000000000
max_bytes_before_external_group_by: 20000000000
# Backup configuration
backup:
enabled: true
schedule: "0 2 * * *"
retention: 7
# OpenTelemetry Collector - Integrated with SigNoz
otelCollector:
enabled: true
replicaCount: 2
image:
repository: signoz/signoz-otel-collector
tag: 0.102.8
pullPolicy: IfNotPresent
service:
type: ClusterIP
ports:
otlpGrpc: 4317
otlpHttp: 4318
metrics: 8888
healthCheck: 13133
resources:
requests:
cpu: 500m
memory: 512Mi
limits:
cpu: 1000m
memory: 1Gi
# Full OTEL Collector Configuration
config:
extensions:
health_check:
endpoint: 0.0.0.0:13133
zpages:
endpoint: 0.0.0.0:55679
receivers:
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
max_recv_msg_size_mib: 16
http:
endpoint: 0.0.0.0:4318
cors:
allowed_origins:
- "https://monitoring.bakewise.ai"
- "https://*.bakewise.ai"
# Prometheus receiver for scraping metrics
prometheus:
config:
scrape_configs:
- job_name: 'otel-collector'
scrape_interval: 30s
static_configs:
- targets: ['localhost:8888']
processors:
batch:
timeout: 10s
send_batch_size: 2048
send_batch_max_size: 4096
memory_limiter:
check_interval: 1s
limit_mib: 800
spike_limit_mib: 200
# Resource detection for K8s
resourcedetection:
detectors: [env, system, docker]
timeout: 5s
# Add resource attributes
resource:
attributes:
- key: deployment.environment
value: production
action: upsert
- key: cluster.name
value: bakery-ia-prod
action: upsert
exporters:
# Export to SigNoz ClickHouse
clickhousetraces:
datasource: tcp://clickhouse:9000/?database=signoz_traces
timeout: 10s
retry_on_failure:
enabled: true
initial_interval: 5s
max_interval: 30s
max_elapsed_time: 300s
clickhousemetricswrite:
endpoint: tcp://clickhouse:9000/?database=signoz_metrics
timeout: 10s
retry_on_failure:
enabled: true
initial_interval: 5s
max_interval: 30s
max_elapsed_time: 300s
clickhouselogsexporter:
dsn: tcp://clickhouse:9000/?database=signoz_logs
timeout: 10s
retry_on_failure:
enabled: true
initial_interval: 5s
max_interval: 30s
max_elapsed_time: 300s
# Minimal logging for prod
logging:
loglevel: warn
sampling_initial: 2
sampling_thereafter: 500
service:
extensions: [health_check, zpages]
pipelines:
traces:
receivers: [otlp]
processors: [memory_limiter, batch, resourcedetection, resource]
exporters: [clickhousetraces, logging]
metrics:
receivers: [otlp, prometheus]
processors: [memory_limiter, batch, resourcedetection, resource]
exporters: [clickhousemetricswrite]
logs:
receivers: [otlp]
processors: [memory_limiter, batch, resourcedetection, resource]
exporters: [clickhouselogsexporter, logging]
# OpenTelemetry Collector Deployment Mode
otelCollectorDeployment:
enabled: true
mode: deployment
# HPA for OTEL Collector
autoscaling:
enabled: true
minReplicas: 2
maxReplicas: 10
targetCPUUtilizationPercentage: 70
targetMemoryUtilizationPercentage: 80
# Node Exporter for infrastructure metrics
nodeExporter:
enabled: true
service:
type: ClusterIP
port: 9100
resources:
requests:
cpu: 100m
memory: 128Mi
limits:
cpu: 200m
memory: 256Mi
# Schemamanager - Manages ClickHouse schema
schemamanager:
enabled: true
image:
repository: signoz/signoz-schema-migrator
tag: 0.52.3
pullPolicy: IfNotPresent
# Additional Configuration
serviceAccount:
create: true
annotations: {}
name: "signoz"
# Security Context
securityContext:
runAsNonRoot: true
runAsUser: 1000
fsGroup: 1000
# Pod Disruption Budgets for HA
podDisruptionBudget:
frontend:
enabled: true
minAvailable: 1
queryService:
enabled: true
minAvailable: 1
alertmanager:
enabled: true
minAvailable: 1
clickhouse:
enabled: true
minAvailable: 1
# Network Policies for security
networkPolicy:
enabled: true
policyTypes:
- Ingress
- Egress
# Monitoring SigNoz itself
selfMonitoring:
enabled: true
serviceMonitor:
enabled: true
interval: 30s

View File

@@ -4,7 +4,7 @@ This directory contains Kubernetes manifests for deploying the Bakery IA platfor
## Quick Start
Deploy the entire platform with these 5 commands:
Deploy the entire platform with these 4 commands:
```bash
# 1. Start Colima with adequate resources
@@ -17,15 +17,14 @@ kind create cluster --config kind-config.yaml
kubectl apply -f https://raw.githubusercontent.com/kubernetes/ingress-nginx/main/deploy/static/provider/kind/deploy.yaml
kubectl wait --namespace ingress-nginx --for=condition=ready pod --selector=app.kubernetes.io/component=controller --timeout=300s
# 4. Configure permanent localhost access
kubectl patch svc ingress-nginx-controller -n ingress-nginx -p '{"spec":{"type":"NodePort","ports":[{"name":"http","port":80,"targetPort":"http","nodePort":30080},{"name":"https","port":443,"targetPort":"https","nodePort":30443}]}}'
# 4. Deploy with Tilt
tilt up
# 5. Deploy with Skaffold
skaffold dev --profile=dev
# 🎉 Access at: https://localhost
# 🎉 Access at: http://localhost (or see Tilt for individual service ports)
```
> **Note**: The kind-config.yaml already configures port mappings (30080→80, 30443→443) for localhost access, so no additional service patching is needed. The NGINX Ingress for Kind uses NodePort by default on those exact ports.
## Prerequisites
Install the following tools on macOS:
@@ -100,11 +99,11 @@ Then access via:
### Start Development Environment
```bash
# Start development mode with hot-reload
skaffold dev --profile=dev
# Start development mode with hot-reload using Tilt
tilt up
# Or one-time deployment
skaffold run --profile=dev
# Or start in background
tilt up --stream
```
### Key Features
@@ -246,13 +245,39 @@ colima stop --profile k8s-local
### Restart Sequence
```bash
# Post-restart startup
# Post-restart startup (or use kubernetes_restart.sh script)
colima start --cpu 6 --memory 12 --disk 120 --runtime docker --profile k8s-local
kind create cluster --config kind-config.yaml
skaffold dev --profile=dev
kubectl apply -f https://raw.githubusercontent.com/kubernetes/ingress-nginx/main/deploy/static/provider/kind/deploy.yaml
kubectl wait --namespace ingress-nginx --for=condition=ready pod --selector=app.kubernetes.io/component=controller --timeout=300s
tilt up
```
## Production Considerations
## Production Deployment
### Production URLs
The production environment uses the following domains:
- **Main Application**: https://bakewise.ai
- Frontend application and all public pages
- API endpoints: https://bakewise.ai/api/v1/...
- **Monitoring Stack**: https://monitoring.bakewise.ai
- Grafana: https://monitoring.bakewise.ai/grafana
- Prometheus: https://monitoring.bakewise.ai/prometheus
- Jaeger: https://monitoring.bakewise.ai/jaeger
- AlertManager: https://monitoring.bakewise.ai/alertmanager
### Production Configuration
The production overlay (`overlays/prod/`) includes:
- **Domain Configuration**: bakewise.ai with Let's Encrypt certificates
- **High Availability**: Multi-replica deployments (2-3 replicas per service)
- **Enhanced Security**: Rate limiting, CORS restrictions, security headers
- **Monitoring**: Full observability stack with Prometheus, Grafana, Jaeger
### Production Considerations
For production deployment:
@@ -263,6 +288,7 @@ For production deployment:
- **External Secrets**: Use managed secret services
- **TLS**: Production Let's Encrypt certificates
- **CI/CD**: Automated deployment pipelines
- **DNS**: Configure DNS A/CNAME records pointing to your cluster's load balancer
## Next Steps

View File

@@ -48,6 +48,9 @@ spec:
name: pos-integration-secrets
- secretRef:
name: whatsapp-secrets
env:
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://otel-collector.monitoring.svc.cluster.local:4317"
resources:
requests:
memory: "256Mi"

View File

@@ -1,429 +0,0 @@
---
apiVersion: v1
kind: ConfigMap
metadata:
name: prometheus-alert-rules
namespace: monitoring
data:
alert-rules.yml: |
groups:
# Basic Infrastructure Alerts
- name: bakery_services
interval: 30s
rules:
- alert: ServiceDown
expr: up{job="bakery-services"} == 0
for: 2m
labels:
severity: critical
component: infrastructure
annotations:
summary: "Service {{ $labels.service }} is down"
description: "Service {{ $labels.service }} in namespace {{ $labels.namespace }} has been down for more than 2 minutes."
runbook_url: "https://runbooks.bakery-ia.local/ServiceDown"
- alert: HighErrorRate
expr: |
(
sum(rate(http_requests_total{status_code=~"5..", job="bakery-services"}[5m])) by (service)
/
sum(rate(http_requests_total{job="bakery-services"}[5m])) by (service)
) > 0.10
for: 5m
labels:
severity: critical
component: application
annotations:
summary: "High error rate on {{ $labels.service }}"
description: "Service {{ $labels.service }} has error rate above 10% (current: {{ $value | humanizePercentage }})."
runbook_url: "https://runbooks.bakery-ia.local/HighErrorRate"
- alert: HighResponseTime
expr: |
histogram_quantile(0.95,
sum(rate(http_request_duration_seconds_bucket{job="bakery-services"}[5m])) by (service, le)
) > 1
for: 5m
labels:
severity: warning
component: performance
annotations:
summary: "High response time on {{ $labels.service }}"
description: "Service {{ $labels.service }} P95 latency is above 1 second (current: {{ $value }}s)."
runbook_url: "https://runbooks.bakery-ia.local/HighResponseTime"
- alert: HighMemoryUsage
expr: |
container_memory_usage_bytes{namespace="bakery-ia", container!=""} > 500000000
for: 5m
labels:
severity: warning
component: infrastructure
annotations:
summary: "High memory usage in {{ $labels.pod }}"
description: "Container {{ $labels.container }} in pod {{ $labels.pod }} is using more than 500MB of memory (current: {{ $value | humanize }}B)."
runbook_url: "https://runbooks.bakery-ia.local/HighMemoryUsage"
- alert: DatabaseConnectionHigh
expr: |
pg_stat_database_numbackends{datname="bakery"} > 80
for: 5m
labels:
severity: warning
component: database
annotations:
summary: "High database connection count"
description: "Database has more than 80 active connections (current: {{ $value }})."
runbook_url: "https://runbooks.bakery-ia.local/DatabaseConnectionHigh"
# Business Logic Alerts
- name: bakery_business
interval: 30s
rules:
- alert: TrainingJobFailed
expr: |
increase(training_job_failures_total[1h]) > 0
for: 5m
labels:
severity: warning
component: ml-training
annotations:
summary: "Training job failures detected"
description: "{{ $value }} training job(s) failed in the last hour."
runbook_url: "https://runbooks.bakery-ia.local/TrainingJobFailed"
- alert: LowPredictionAccuracy
expr: |
prediction_model_accuracy < 0.70
for: 15m
labels:
severity: warning
component: ml-inference
annotations:
summary: "Model prediction accuracy is low"
description: "Model {{ $labels.model_name }} accuracy is below 70% (current: {{ $value | humanizePercentage }})."
runbook_url: "https://runbooks.bakery-ia.local/LowPredictionAccuracy"
- alert: APIRateLimitHit
expr: |
increase(rate_limit_hits_total[5m]) > 10
for: 5m
labels:
severity: info
component: api-gateway
annotations:
summary: "API rate limits being hit frequently"
description: "Rate limits hit {{ $value }} times in the last 5 minutes."
runbook_url: "https://runbooks.bakery-ia.local/APIRateLimitHit"
# Alert System Health
- name: alert_system_health
interval: 30s
rules:
- alert: AlertSystemComponentDown
expr: |
alert_system_component_health{component=~"processor|notifier|scheduler"} == 0
for: 2m
labels:
severity: critical
component: alert-system
annotations:
summary: "Alert system component {{ $labels.component }} is unhealthy"
description: "Component {{ $labels.component }} has been unhealthy for more than 2 minutes."
runbook_url: "https://runbooks.bakery-ia.local/AlertSystemComponentDown"
- alert: RabbitMQConnectionDown
expr: |
rabbitmq_up == 0
for: 1m
labels:
severity: critical
component: alert-system
annotations:
summary: "RabbitMQ connection is down"
description: "Alert system has lost connection to RabbitMQ message queue."
runbook_url: "https://runbooks.bakery-ia.local/RabbitMQConnectionDown"
- alert: RedisConnectionDown
expr: |
redis_up == 0
for: 1m
labels:
severity: critical
component: alert-system
annotations:
summary: "Redis connection is down"
description: "Alert system has lost connection to Redis cache."
runbook_url: "https://runbooks.bakery-ia.local/RedisConnectionDown"
- alert: NoSchedulerLeader
expr: |
sum(alert_system_scheduler_leader) == 0
for: 5m
labels:
severity: warning
component: alert-system
annotations:
summary: "No alert scheduler leader elected"
description: "No scheduler instance has been elected as leader for 5 minutes."
runbook_url: "https://runbooks.bakery-ia.local/NoSchedulerLeader"
# Alert System Performance
- name: alert_system_performance
interval: 30s
rules:
- alert: HighAlertProcessingErrorRate
expr: |
(
sum(rate(alert_processing_errors_total[2m]))
/
sum(rate(alerts_processed_total[2m]))
) > 0.10
for: 2m
labels:
severity: critical
component: alert-system
annotations:
summary: "High alert processing error rate"
description: "Alert processing error rate is above 10% (current: {{ $value | humanizePercentage }})."
runbook_url: "https://runbooks.bakery-ia.local/HighAlertProcessingErrorRate"
- alert: HighNotificationDeliveryFailureRate
expr: |
(
sum(rate(notification_delivery_failures_total[3m]))
/
sum(rate(notifications_sent_total[3m]))
) > 0.05
for: 3m
labels:
severity: warning
component: alert-system
annotations:
summary: "High notification delivery failure rate"
description: "Notification delivery failure rate is above 5% (current: {{ $value | humanizePercentage }})."
runbook_url: "https://runbooks.bakery-ia.local/HighNotificationDeliveryFailureRate"
- alert: HighAlertProcessingLatency
expr: |
histogram_quantile(0.95,
sum(rate(alert_processing_duration_seconds_bucket[5m])) by (le)
) > 5
for: 5m
labels:
severity: warning
component: alert-system
annotations:
summary: "High alert processing latency"
description: "P95 alert processing latency is above 5 seconds (current: {{ $value }}s)."
runbook_url: "https://runbooks.bakery-ia.local/HighAlertProcessingLatency"
- alert: TooManySSEConnections
expr: |
sse_active_connections > 1000
for: 2m
labels:
severity: warning
component: alert-system
annotations:
summary: "Too many active SSE connections"
description: "More than 1000 active SSE connections (current: {{ $value }})."
runbook_url: "https://runbooks.bakery-ia.local/TooManySSEConnections"
- alert: SSEConnectionErrors
expr: |
rate(sse_connection_errors_total[3m]) > 0.5
for: 3m
labels:
severity: warning
component: alert-system
annotations:
summary: "High rate of SSE connection errors"
description: "SSE connection error rate is {{ $value }} errors/sec."
runbook_url: "https://runbooks.bakery-ia.local/SSEConnectionErrors"
# Alert System Business Logic
- name: alert_system_business
interval: 30s
rules:
- alert: UnusuallyHighAlertVolume
expr: |
rate(alerts_generated_total[5m]) > 2
for: 5m
labels:
severity: warning
component: alert-system
annotations:
summary: "Unusually high alert generation volume"
description: "More than 2 alerts per second being generated (current: {{ $value }}/sec)."
runbook_url: "https://runbooks.bakery-ia.local/UnusuallyHighAlertVolume"
- alert: NoAlertsGenerated
expr: |
rate(alerts_generated_total[30m]) == 0
for: 15m
labels:
severity: info
component: alert-system
annotations:
summary: "No alerts generated recently"
description: "No alerts have been generated in the last 30 minutes. This might indicate a problem with alert detection."
runbook_url: "https://runbooks.bakery-ia.local/NoAlertsGenerated"
- alert: SlowAlertResponseTime
expr: |
histogram_quantile(0.95,
sum(rate(alert_response_time_seconds_bucket[10m])) by (le)
) > 3600
for: 10m
labels:
severity: warning
component: alert-system
annotations:
summary: "Slow alert response times"
description: "P95 alert response time is above 1 hour (current: {{ $value | humanizeDuration }})."
runbook_url: "https://runbooks.bakery-ia.local/SlowAlertResponseTime"
- alert: CriticalAlertsUnacknowledged
expr: |
sum(alerts_unacknowledged{severity="critical"}) > 5
for: 10m
labels:
severity: warning
component: alert-system
annotations:
summary: "Multiple critical alerts unacknowledged"
description: "{{ $value }} critical alerts have not been acknowledged for 10+ minutes."
runbook_url: "https://runbooks.bakery-ia.local/CriticalAlertsUnacknowledged"
# Alert System Capacity
- name: alert_system_capacity
interval: 30s
rules:
- alert: LargeSSEMessageQueues
expr: |
sse_message_queue_size > 100
for: 5m
labels:
severity: warning
component: alert-system
annotations:
summary: "Large SSE message queues detected"
description: "SSE message queue for tenant {{ $labels.tenant_id }} has {{ $value }} messages queued."
runbook_url: "https://runbooks.bakery-ia.local/LargeSSEMessageQueues"
- alert: SlowDatabaseStorage
expr: |
histogram_quantile(0.95,
sum(rate(alert_storage_duration_seconds_bucket[5m])) by (le)
) > 1
for: 5m
labels:
severity: warning
component: alert-system
annotations:
summary: "Slow alert database storage"
description: "P95 alert storage latency is above 1 second (current: {{ $value }}s)."
runbook_url: "https://runbooks.bakery-ia.local/SlowDatabaseStorage"
# Alert System Critical Scenarios
- name: alert_system_critical
interval: 15s
rules:
- alert: AlertSystemDown
expr: |
up{service=~"alert-processor|notification-service"} == 0
for: 1m
labels:
severity: critical
component: alert-system
annotations:
summary: "Alert system is completely down"
description: "Core alert system service {{ $labels.service }} is down."
runbook_url: "https://runbooks.bakery-ia.local/AlertSystemDown"
- alert: AlertDataNotPersisted
expr: |
(
sum(rate(alerts_processed_total[2m]))
-
sum(rate(alerts_stored_total[2m]))
) > 0
for: 2m
labels:
severity: critical
component: alert-system
annotations:
summary: "Alerts not being persisted to database"
description: "Alerts are being processed but not stored in the database."
runbook_url: "https://runbooks.bakery-ia.local/AlertDataNotPersisted"
- alert: NotificationsNotDelivered
expr: |
(
sum(rate(alerts_processed_total[3m]))
-
sum(rate(notifications_sent_total[3m]))
) > 0
for: 3m
labels:
severity: critical
component: alert-system
annotations:
summary: "Notifications not being delivered"
description: "Alerts are being processed but notifications are not being sent."
runbook_url: "https://runbooks.bakery-ia.local/NotificationsNotDelivered"
# Monitoring System Self-Monitoring
- name: monitoring_health
interval: 30s
rules:
- alert: PrometheusDown
expr: up{job="prometheus"} == 0
for: 5m
labels:
severity: critical
component: monitoring
annotations:
summary: "Prometheus is down"
description: "Prometheus monitoring system is not responding."
runbook_url: "https://runbooks.bakery-ia.local/PrometheusDown"
- alert: AlertManagerDown
expr: up{job="alertmanager"} == 0
for: 2m
labels:
severity: critical
component: monitoring
annotations:
summary: "AlertManager is down"
description: "AlertManager is not responding. Alerts will not be routed."
runbook_url: "https://runbooks.bakery-ia.local/AlertManagerDown"
- alert: PrometheusStorageFull
expr: |
(
prometheus_tsdb_storage_blocks_bytes
/
(prometheus_tsdb_storage_blocks_bytes + prometheus_tsdb_wal_size_bytes)
) > 0.90
for: 10m
labels:
severity: warning
component: monitoring
annotations:
summary: "Prometheus storage almost full"
description: "Prometheus storage is {{ $value | humanizePercentage }} full."
runbook_url: "https://runbooks.bakery-ia.local/PrometheusStorageFull"
- alert: PrometheusScrapeErrors
expr: |
rate(prometheus_target_scrapes_exceeded_sample_limit_total[5m]) > 0
for: 5m
labels:
severity: warning
component: monitoring
annotations:
summary: "Prometheus scrape errors detected"
description: "Prometheus is experiencing scrape errors for target {{ $labels.job }}."
runbook_url: "https://runbooks.bakery-ia.local/PrometheusScrapeErrors"

View File

@@ -1,27 +0,0 @@
---
# InitContainer to substitute secrets into AlertManager config
# This allows us to use environment variables from secrets in the config file
apiVersion: v1
kind: ConfigMap
metadata:
name: alertmanager-init-script
namespace: monitoring
data:
init-config.sh: |
#!/bin/sh
set -e
# Read the template config
TEMPLATE=$(cat /etc/alertmanager-template/alertmanager.yml)
# Substitute environment variables
echo "$TEMPLATE" | \
sed "s|{{ .smtp_host }}|${SMTP_HOST}|g" | \
sed "s|{{ .smtp_from }}|${SMTP_FROM}|g" | \
sed "s|{{ .smtp_username }}|${SMTP_USERNAME}|g" | \
sed "s|{{ .smtp_password }}|${SMTP_PASSWORD}|g" | \
sed "s|{{ .slack_webhook_url }}|${SLACK_WEBHOOK_URL}|g" \
> /etc/alertmanager-final/alertmanager.yml
echo "AlertManager config initialized successfully"
cat /etc/alertmanager-final/alertmanager.yml

View File

@@ -1,391 +0,0 @@
---
apiVersion: v1
kind: ConfigMap
metadata:
name: alertmanager-config
namespace: monitoring
data:
alertmanager.yml: |
global:
resolve_timeout: 5m
smtp_smarthost: '{{ .smtp_host }}'
smtp_from: '{{ .smtp_from }}'
smtp_auth_username: '{{ .smtp_username }}'
smtp_auth_password: '{{ .smtp_password }}'
smtp_require_tls: true
# Define notification templates
templates:
- '/etc/alertmanager/templates/*.tmpl'
# Route alerts to appropriate receivers
route:
# Default receiver
receiver: 'default-email'
# Group alerts by these labels
group_by: ['alertname', 'cluster', 'service']
# Wait time before sending initial notification
group_wait: 10s
# Wait time before sending notifications about new alerts in the group
group_interval: 10s
# Wait time before re-sending a notification
repeat_interval: 12h
# Child routes for specific alert routing
routes:
# Critical alerts - send immediately to all channels
- match:
severity: critical
receiver: 'critical-alerts'
group_wait: 0s
group_interval: 5m
repeat_interval: 4h
continue: true
# Warning alerts - less urgent
- match:
severity: warning
receiver: 'warning-alerts'
group_wait: 30s
group_interval: 5m
repeat_interval: 12h
# Alert system specific alerts
- match:
component: alert-system
receiver: 'alert-system-team'
group_wait: 10s
repeat_interval: 6h
# Database alerts
- match_re:
alertname: ^(DatabaseConnectionHigh|SlowDatabaseStorage)$
receiver: 'database-team'
group_wait: 30s
repeat_interval: 8h
# Infrastructure alerts
- match_re:
alertname: ^(HighMemoryUsage|ServiceDown)$
receiver: 'infra-team'
group_wait: 30s
repeat_interval: 6h
# Inhibition rules - prevent alert spam
inhibit_rules:
# If service is down, inhibit all other alerts for that service
- source_match:
alertname: 'ServiceDown'
target_match_re:
alertname: '(HighErrorRate|HighResponseTime|HighMemoryUsage)'
equal: ['service']
# If AlertSystem is completely down, inhibit component alerts
- source_match:
alertname: 'AlertSystemDown'
target_match_re:
alertname: 'AlertSystemComponent.*'
equal: ['namespace']
# If RabbitMQ is down, inhibit alert processing errors
- source_match:
alertname: 'RabbitMQConnectionDown'
target_match:
alertname: 'HighAlertProcessingErrorRate'
equal: ['namespace']
# Receivers - notification destinations
receivers:
# Default email receiver
- name: 'default-email'
email_configs:
- to: 'alerts@yourdomain.com'
headers:
Subject: '[{{ .Status | toUpper }}] {{ .GroupLabels.alertname }} - {{ .GroupLabels.service }}'
html: |
{{ range .Alerts }}
<h2>{{ .Labels.alertname }}</h2>
<p><strong>Status:</strong> {{ .Status }}</p>
<p><strong>Severity:</strong> {{ .Labels.severity }}</p>
<p><strong>Service:</strong> {{ .Labels.service }}</p>
<p><strong>Summary:</strong> {{ .Annotations.summary }}</p>
<p><strong>Description:</strong> {{ .Annotations.description }}</p>
<p><strong>Started:</strong> {{ .StartsAt }}</p>
{{ if .EndsAt }}<p><strong>Ended:</strong> {{ .EndsAt }}</p>{{ end }}
{{ end }}
# Critical alerts - multiple channels
- name: 'critical-alerts'
email_configs:
- to: 'critical-alerts@yourdomain.com,oncall@yourdomain.com'
headers:
Subject: '🚨 [CRITICAL] {{ .GroupLabels.alertname }} - {{ .GroupLabels.service }}'
send_resolved: true
# Uncomment to enable Slack notifications
# slack_configs:
# - api_url: '{{ .slack_webhook_url }}'
# channel: '#alerts-critical'
# title: '🚨 Critical Alert'
# text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
# send_resolved: true
# Warning alerts
- name: 'warning-alerts'
email_configs:
- to: 'alerts@yourdomain.com'
headers:
Subject: '⚠️ [WARNING] {{ .GroupLabels.alertname }} - {{ .GroupLabels.service }}'
send_resolved: true
# Alert system team
- name: 'alert-system-team'
email_configs:
- to: 'alert-system-team@yourdomain.com'
headers:
Subject: '[Alert System] {{ .GroupLabels.alertname }}'
send_resolved: true
# Database team
- name: 'database-team'
email_configs:
- to: 'database-team@yourdomain.com'
headers:
Subject: '[Database] {{ .GroupLabels.alertname }}'
send_resolved: true
# Infrastructure team
- name: 'infra-team'
email_configs:
- to: 'infra-team@yourdomain.com'
headers:
Subject: '[Infrastructure] {{ .GroupLabels.alertname }}'
send_resolved: true
---
apiVersion: v1
kind: ConfigMap
metadata:
name: alertmanager-templates
namespace: monitoring
data:
default.tmpl: |
{{ define "cluster" }}{{ .ExternalURL | reReplaceAll ".*alertmanager\\.(.*)" "$1" }}{{ end }}
{{ define "slack.default.title" }}
[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.alertname }}
{{ end }}
{{ define "slack.default.text" }}
{{ range .Alerts }}
*Alert:* {{ .Annotations.summary }}
*Description:* {{ .Annotations.description }}
*Severity:* `{{ .Labels.severity }}`
*Service:* `{{ .Labels.service }}`
{{ end }}
{{ end }}
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: alertmanager
namespace: monitoring
labels:
app: alertmanager
spec:
serviceName: alertmanager
replicas: 3
selector:
matchLabels:
app: alertmanager
template:
metadata:
labels:
app: alertmanager
spec:
serviceAccountName: prometheus
initContainers:
- name: init-config
image: busybox:1.36
command: ['/bin/sh', '/scripts/init-config.sh']
env:
- name: SMTP_HOST
valueFrom:
secretKeyRef:
name: alertmanager-secrets
key: smtp-host
- name: SMTP_USERNAME
valueFrom:
secretKeyRef:
name: alertmanager-secrets
key: smtp-username
- name: SMTP_PASSWORD
valueFrom:
secretKeyRef:
name: alertmanager-secrets
key: smtp-password
- name: SMTP_FROM
valueFrom:
secretKeyRef:
name: alertmanager-secrets
key: smtp-from
- name: SLACK_WEBHOOK_URL
valueFrom:
secretKeyRef:
name: alertmanager-secrets
key: slack-webhook-url
optional: true
volumeMounts:
- name: init-script
mountPath: /scripts
- name: config-template
mountPath: /etc/alertmanager-template
- name: config-final
mountPath: /etc/alertmanager-final
affinity:
podAntiAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
podAffinityTerm:
labelSelector:
matchExpressions:
- key: app
operator: In
values:
- alertmanager
topologyKey: kubernetes.io/hostname
containers:
- name: alertmanager
image: prom/alertmanager:v0.27.0
args:
- '--config.file=/etc/alertmanager/alertmanager.yml'
- '--storage.path=/alertmanager'
- '--cluster.listen-address=0.0.0.0:9094'
- '--cluster.peer=alertmanager-0.alertmanager.monitoring.svc.cluster.local:9094'
- '--cluster.peer=alertmanager-1.alertmanager.monitoring.svc.cluster.local:9094'
- '--cluster.peer=alertmanager-2.alertmanager.monitoring.svc.cluster.local:9094'
- '--cluster.reconnect-timeout=5m'
- '--web.external-url=http://monitoring.bakery-ia.local/alertmanager'
- '--web.route-prefix=/'
ports:
- name: web
containerPort: 9093
- name: mesh-tcp
containerPort: 9094
- name: mesh-udp
containerPort: 9094
protocol: UDP
env:
- name: POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
volumeMounts:
- name: config-final
mountPath: /etc/alertmanager
- name: templates
mountPath: /etc/alertmanager/templates
- name: storage
mountPath: /alertmanager
resources:
requests:
memory: "128Mi"
cpu: "100m"
limits:
memory: "256Mi"
cpu: "500m"
livenessProbe:
httpGet:
path: /-/healthy
port: 9093
initialDelaySeconds: 30
periodSeconds: 10
readinessProbe:
httpGet:
path: /-/ready
port: 9093
initialDelaySeconds: 5
periodSeconds: 5
# Config reloader sidecar
- name: configmap-reload
image: jimmidyson/configmap-reload:v0.12.0
args:
- '--webhook-url=http://localhost:9093/-/reload'
- '--volume-dir=/etc/alertmanager'
volumeMounts:
- name: config-final
mountPath: /etc/alertmanager
readOnly: true
resources:
requests:
memory: "16Mi"
cpu: "10m"
limits:
memory: "32Mi"
cpu: "50m"
volumes:
- name: init-script
configMap:
name: alertmanager-init-script
defaultMode: 0755
- name: config-template
configMap:
name: alertmanager-config
- name: config-final
emptyDir: {}
- name: templates
configMap:
name: alertmanager-templates
volumeClaimTemplates:
- metadata:
name: storage
spec:
accessModes: [ "ReadWriteOnce" ]
resources:
requests:
storage: 2Gi
---
apiVersion: v1
kind: Service
metadata:
name: alertmanager
namespace: monitoring
labels:
app: alertmanager
spec:
type: ClusterIP
clusterIP: None
ports:
- name: web
port: 9093
targetPort: 9093
- name: mesh-tcp
port: 9094
targetPort: 9094
- name: mesh-udp
port: 9094
targetPort: 9094
protocol: UDP
selector:
app: alertmanager
---
apiVersion: v1
kind: Service
metadata:
name: alertmanager-external
namespace: monitoring
labels:
app: alertmanager
spec:
type: ClusterIP
ports:
- name: web
port: 9093
targetPort: 9093
selector:
app: alertmanager

View File

@@ -1,949 +0,0 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-dashboards-extended
namespace: monitoring
data:
postgresql-dashboard.json: |
{
"dashboard": {
"title": "Bakery IA - PostgreSQL Database",
"tags": ["bakery-ia", "postgresql", "database"],
"timezone": "browser",
"refresh": "30s",
"schemaVersion": 16,
"version": 1,
"panels": [
{
"id": 1,
"title": "Active Connections by Database",
"type": "graph",
"gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
"targets": [
{
"expr": "pg_stat_activity_count{state=\"active\"}",
"legendFormat": "{{datname}} - active"
},
{
"expr": "pg_stat_activity_count{state=\"idle\"}",
"legendFormat": "{{datname}} - idle"
},
{
"expr": "pg_stat_activity_count{state=\"idle in transaction\"}",
"legendFormat": "{{datname}} - idle tx"
}
]
},
{
"id": 2,
"title": "Total Connections",
"type": "stat",
"gridPos": {"x": 12, "y": 0, "w": 6, "h": 4},
"targets": [
{
"expr": "sum(pg_stat_activity_count)",
"legendFormat": "Total connections"
}
]
},
{
"id": 3,
"title": "Max Connections",
"type": "stat",
"gridPos": {"x": 18, "y": 0, "w": 6, "h": 4},
"targets": [
{
"expr": "pg_settings_max_connections",
"legendFormat": "Max connections"
}
]
},
{
"id": 4,
"title": "Transaction Rate (Commits vs Rollbacks)",
"type": "graph",
"gridPos": {"x": 0, "y": 8, "w": 12, "h": 8},
"targets": [
{
"expr": "rate(pg_stat_database_xact_commit[5m])",
"legendFormat": "{{datname}} - commits"
},
{
"expr": "rate(pg_stat_database_xact_rollback[5m])",
"legendFormat": "{{datname}} - rollbacks"
}
]
},
{
"id": 5,
"title": "Cache Hit Ratio",
"type": "graph",
"gridPos": {"x": 12, "y": 8, "w": 12, "h": 8},
"targets": [
{
"expr": "100 * (1 - (sum(rate(pg_stat_io_blocks_read_total[5m])) / (sum(rate(pg_stat_io_blocks_read_total[5m])) + sum(rate(pg_stat_io_blocks_hit_total[5m])))))",
"legendFormat": "Cache hit ratio %"
}
]
},
{
"id": 6,
"title": "Slow Queries (> 30s)",
"type": "table",
"gridPos": {"x": 0, "y": 16, "w": 12, "h": 8},
"targets": [
{
"expr": "pg_slow_queries{duration_ms > 30000}",
"format": "table",
"instant": true
}
],
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": {},
"indexByName": {},
"renameByName": {
"query": "Query",
"duration_ms": "Duration (ms)",
"datname": "Database"
}
}
}
]
},
{
"id": 7,
"title": "Dead Tuples by Table",
"type": "graph",
"gridPos": {"x": 12, "y": 16, "w": 12, "h": 8},
"targets": [
{
"expr": "pg_stat_user_tables_n_dead_tup",
"legendFormat": "{{schemaname}}.{{relname}}"
}
]
},
{
"id": 8,
"title": "Table Bloat Estimate",
"type": "graph",
"gridPos": {"x": 0, "y": 24, "w": 12, "h": 8},
"targets": [
{
"expr": "100 * (pg_stat_user_tables_n_dead_tup * avg_tuple_size) / (pg_total_relation_size * 8192)",
"legendFormat": "{{schemaname}}.{{relname}} bloat %"
}
]
},
{
"id": 9,
"title": "Replication Lag (bytes)",
"type": "graph",
"gridPos": {"x": 12, "y": 24, "w": 12, "h": 8},
"targets": [
{
"expr": "pg_replication_lag_bytes",
"legendFormat": "{{slot_name}} - {{application_name}}"
}
]
},
{
"id": 10,
"title": "Database Size (GB)",
"type": "graph",
"gridPos": {"x": 0, "y": 32, "w": 12, "h": 8},
"targets": [
{
"expr": "pg_database_size_bytes / 1024 / 1024 / 1024",
"legendFormat": "{{datname}}"
}
]
},
{
"id": 11,
"title": "Database Size Growth (per hour)",
"type": "graph",
"gridPos": {"x": 12, "y": 32, "w": 12, "h": 8},
"targets": [
{
"expr": "rate(pg_database_size_bytes[1h])",
"legendFormat": "{{datname}} - bytes/hour"
}
]
},
{
"id": 12,
"title": "Lock Counts by Type",
"type": "graph",
"gridPos": {"x": 0, "y": 40, "w": 12, "h": 8},
"targets": [
{
"expr": "pg_locks_count",
"legendFormat": "{{datname}} - {{locktype}} - {{mode}}"
}
]
},
{
"id": 13,
"title": "Query Duration (p95)",
"type": "graph",
"gridPos": {"x": 12, "y": 40, "w": 12, "h": 8},
"targets": [
{
"expr": "histogram_quantile(0.95, rate(pg_query_duration_seconds_bucket[5m]))",
"legendFormat": "p95"
}
]
}
]
}
}
node-exporter-dashboard.json: |
{
"dashboard": {
"title": "Bakery IA - Node Exporter Infrastructure",
"tags": ["bakery-ia", "node-exporter", "infrastructure"],
"timezone": "browser",
"refresh": "15s",
"schemaVersion": 16,
"version": 1,
"panels": [
{
"id": 1,
"title": "CPU Usage by Node",
"type": "graph",
"gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
"targets": [
{
"expr": "100 - (avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
"legendFormat": "{{instance}} - {{cpu}}"
}
]
},
{
"id": 2,
"title": "Average CPU Usage",
"type": "stat",
"gridPos": {"x": 12, "y": 0, "w": 6, "h": 4},
"targets": [
{
"expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
"legendFormat": "Average CPU %"
}
]
},
{
"id": 3,
"title": "CPU Load (1m, 5m, 15m)",
"type": "stat",
"gridPos": {"x": 18, "y": 0, "w": 6, "h": 4},
"targets": [
{
"expr": "avg(node_load1)",
"legendFormat": "1m"
},
{
"expr": "avg(node_load5)",
"legendFormat": "5m"
},
{
"expr": "avg(node_load15)",
"legendFormat": "15m"
}
]
},
{
"id": 4,
"title": "Memory Usage by Node",
"type": "graph",
"gridPos": {"x": 0, "y": 8, "w": 12, "h": 8},
"targets": [
{
"expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
"legendFormat": "{{instance}}"
}
]
},
{
"id": 5,
"title": "Memory Used (GB)",
"type": "stat",
"gridPos": {"x": 12, "y": 8, "w": 6, "h": 4},
"targets": [
{
"expr": "(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / 1024 / 1024 / 1024",
"legendFormat": "{{instance}}"
}
]
},
{
"id": 6,
"title": "Memory Available (GB)",
"type": "stat",
"gridPos": {"x": 18, "y": 8, "w": 6, "h": 4},
"targets": [
{
"expr": "node_memory_MemAvailable_bytes / 1024 / 1024 / 1024",
"legendFormat": "{{instance}}"
}
]
},
{
"id": 7,
"title": "Disk I/O Read Rate (MB/s)",
"type": "graph",
"gridPos": {"x": 0, "y": 16, "w": 12, "h": 8},
"targets": [
{
"expr": "rate(node_disk_read_bytes_total[5m]) / 1024 / 1024",
"legendFormat": "{{instance}} - {{device}}"
}
]
},
{
"id": 8,
"title": "Disk I/O Write Rate (MB/s)",
"type": "graph",
"gridPos": {"x": 12, "y": 16, "w": 12, "h": 8},
"targets": [
{
"expr": "rate(node_disk_written_bytes_total[5m]) / 1024 / 1024",
"legendFormat": "{{instance}} - {{device}}"
}
]
},
{
"id": 9,
"title": "Disk I/O Operations (IOPS)",
"type": "graph",
"gridPos": {"x": 0, "y": 24, "w": 12, "h": 8},
"targets": [
{
"expr": "rate(node_disk_reads_completed_total[5m]) + rate(node_disk_writes_completed_total[5m])",
"legendFormat": "{{instance}} - {{device}}"
}
]
},
{
"id": 10,
"title": "Network Receive Rate (Mbps)",
"type": "graph",
"gridPos": {"x": 12, "y": 24, "w": 12, "h": 8},
"targets": [
{
"expr": "rate(node_network_receive_bytes_total{device!=\"lo\"}[5m]) * 8 / 1024 / 1024",
"legendFormat": "{{instance}} - {{device}}"
}
]
},
{
"id": 11,
"title": "Network Transmit Rate (Mbps)",
"type": "graph",
"gridPos": {"x": 0, "y": 32, "w": 12, "h": 8},
"targets": [
{
"expr": "rate(node_network_transmit_bytes_total{device!=\"lo\"}[5m]) * 8 / 1024 / 1024",
"legendFormat": "{{instance}} - {{device}}"
}
]
},
{
"id": 12,
"title": "Network Errors",
"type": "graph",
"gridPos": {"x": 12, "y": 32, "w": 12, "h": 8},
"targets": [
{
"expr": "rate(node_network_receive_errs_total[5m]) + rate(node_network_transmit_errs_total[5m])",
"legendFormat": "{{instance}} - {{device}}"
}
]
},
{
"id": 13,
"title": "Filesystem Usage by Mount",
"type": "graph",
"gridPos": {"x": 0, "y": 40, "w": 12, "h": 8},
"targets": [
{
"expr": "100 * (1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes))",
"legendFormat": "{{instance}} - {{mountpoint}}"
}
]
},
{
"id": 14,
"title": "Filesystem Available (GB)",
"type": "stat",
"gridPos": {"x": 12, "y": 40, "w": 6, "h": 4},
"targets": [
{
"expr": "node_filesystem_avail_bytes / 1024 / 1024 / 1024",
"legendFormat": "{{instance}} - {{mountpoint}}"
}
]
},
{
"id": 15,
"title": "Filesystem Size (GB)",
"type": "stat",
"gridPos": {"x": 18, "y": 40, "w": 6, "h": 4},
"targets": [
{
"expr": "node_filesystem_size_bytes / 1024 / 1024 / 1024",
"legendFormat": "{{instance}} - {{mountpoint}}"
}
]
},
{
"id": 16,
"title": "Load Average (1m, 5m, 15m)",
"type": "graph",
"gridPos": {"x": 0, "y": 48, "w": 12, "h": 8},
"targets": [
{
"expr": "node_load1",
"legendFormat": "{{instance}} - 1m"
},
{
"expr": "node_load5",
"legendFormat": "{{instance}} - 5m"
},
{
"expr": "node_load15",
"legendFormat": "{{instance}} - 15m"
}
]
},
{
"id": 17,
"title": "System Up Time",
"type": "stat",
"gridPos": {"x": 12, "y": 48, "w": 12, "h": 8},
"targets": [
{
"expr": "node_boot_time_seconds",
"legendFormat": "{{instance}} - uptime"
}
]
},
{
"id": 18,
"title": "Context Switches",
"type": "graph",
"gridPos": {"x": 0, "y": 56, "w": 12, "h": 8},
"targets": [
{
"expr": "rate(node_context_switches_total[5m])",
"legendFormat": "{{instance}}"
}
]
},
{
"id": 19,
"title": "Interrupts",
"type": "graph",
"gridPos": {"x": 12, "y": 56, "w": 12, "h": 8},
"targets": [
{
"expr": "rate(node_intr_total[5m])",
"legendFormat": "{{instance}}"
}
]
}
]
}
}
alertmanager-dashboard.json: |
{
"dashboard": {
"title": "Bakery IA - AlertManager Monitoring",
"tags": ["bakery-ia", "alertmanager", "alerting"],
"timezone": "browser",
"refresh": "10s",
"schemaVersion": 16,
"version": 1,
"panels": [
{
"id": 1,
"title": "Active Alerts by Severity",
"type": "graph",
"gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
"targets": [
{
"expr": "count by (severity) (ALERTS{alertstate=\"firing\"})",
"legendFormat": "{{severity}}"
}
]
},
{
"id": 2,
"title": "Total Active Alerts",
"type": "stat",
"gridPos": {"x": 12, "y": 0, "w": 6, "h": 4},
"targets": [
{
"expr": "count(ALERTS{alertstate=\"firing\"})",
"legendFormat": "Active alerts"
}
]
},
{
"id": 3,
"title": "Critical Alerts",
"type": "stat",
"gridPos": {"x": 18, "y": 0, "w": 6, "h": 4},
"targets": [
{
"expr": "count(ALERTS{alertstate=\"firing\", severity=\"critical\"})",
"legendFormat": "Critical"
}
]
},
{
"id": 4,
"title": "Alert Firing Rate (per minute)",
"type": "graph",
"gridPos": {"x": 0, "y": 8, "w": 12, "h": 8},
"targets": [
{
"expr": "rate(alertmanager_alerts_fired_total[1m])",
"legendFormat": "Alerts fired/min"
}
]
},
{
"id": 5,
"title": "Alert Resolution Rate (per minute)",
"type": "graph",
"gridPos": {"x": 12, "y": 8, "w": 12, "h": 8},
"targets": [
{
"expr": "rate(alertmanager_alerts_resolved_total[1m])",
"legendFormat": "Alerts resolved/min"
}
]
},
{
"id": 6,
"title": "Notification Success Rate",
"type": "graph",
"gridPos": {"x": 0, "y": 16, "w": 12, "h": 8},
"targets": [
{
"expr": "100 * (rate(alertmanager_notifications_total{status=\"success\"}[5m]) / rate(alertmanager_notifications_total[5m]))",
"legendFormat": "Success rate %"
}
]
},
{
"id": 7,
"title": "Notification Failures",
"type": "graph",
"gridPos": {"x": 12, "y": 16, "w": 12, "h": 8},
"targets": [
{
"expr": "rate(alertmanager_notifications_total{status=\"failed\"}[5m])",
"legendFormat": "{{integration}}"
}
]
},
{
"id": 8,
"title": "Silenced Alerts",
"type": "stat",
"gridPos": {"x": 0, "y": 24, "w": 6, "h": 4},
"targets": [
{
"expr": "count(ALERTS{alertstate=\"silenced\"})",
"legendFormat": "Silenced"
}
]
},
{
"id": 9,
"title": "AlertManager Cluster Size",
"type": "stat",
"gridPos": {"x": 6, "y": 24, "w": 6, "h": 4},
"targets": [
{
"expr": "count(alertmanager_cluster_peers)",
"legendFormat": "Cluster peers"
}
]
},
{
"id": 10,
"title": "AlertManager Peers",
"type": "stat",
"gridPos": {"x": 12, "y": 24, "w": 6, "h": 4},
"targets": [
{
"expr": "alertmanager_cluster_peers",
"legendFormat": "{{instance}}"
}
]
},
{
"id": 11,
"title": "Cluster Status",
"type": "stat",
"gridPos": {"x": 18, "y": 24, "w": 6, "h": 4},
"targets": [
{
"expr": "up{job=\"alertmanager\"}",
"legendFormat": "{{instance}}"
}
]
},
{
"id": 12,
"title": "Alerts by Group",
"type": "table",
"gridPos": {"x": 0, "y": 28, "w": 12, "h": 8},
"targets": [
{
"expr": "count by (alertname) (ALERTS{alertstate=\"firing\"})",
"format": "table",
"instant": true
}
],
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": {},
"indexByName": {},
"renameByName": {
"alertname": "Alert Name",
"Value": "Count"
}
}
}
]
},
{
"id": 13,
"title": "Alert Duration (p99)",
"type": "graph",
"gridPos": {"x": 12, "y": 28, "w": 12, "h": 8},
"targets": [
{
"expr": "histogram_quantile(0.99, rate(alertmanager_alert_duration_seconds_bucket[5m]))",
"legendFormat": "p99 duration"
}
]
},
{
"id": 14,
"title": "Processing Time",
"type": "graph",
"gridPos": {"x": 0, "y": 36, "w": 12, "h": 8},
"targets": [
{
"expr": "rate(alertmanager_receiver_processing_duration_seconds_sum[5m]) / rate(alertmanager_receiver_processing_duration_seconds_count[5m])",
"legendFormat": "{{receiver}}"
}
]
},
{
"id": 15,
"title": "Memory Usage",
"type": "stat",
"gridPos": {"x": 12, "y": 36, "w": 12, "h": 8},
"targets": [
{
"expr": "process_resident_memory_bytes{job=\"alertmanager\"} / 1024 / 1024",
"legendFormat": "{{instance}} - MB"
}
]
}
]
}
}
business-metrics-dashboard.json: |
{
"dashboard": {
"title": "Bakery IA - Business Metrics & KPIs",
"tags": ["bakery-ia", "business-metrics", "kpis"],
"timezone": "browser",
"refresh": "30s",
"schemaVersion": 16,
"version": 1,
"panels": [
{
"id": 1,
"title": "Requests per Service (Rate)",
"type": "graph",
"gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
"targets": [
{
"expr": "sum by (service) (rate(http_requests_total[5m]))",
"legendFormat": "{{service}}"
}
]
},
{
"id": 2,
"title": "Total Request Rate",
"type": "stat",
"gridPos": {"x": 12, "y": 0, "w": 6, "h": 4},
"targets": [
{
"expr": "sum(rate(http_requests_total[5m]))",
"legendFormat": "requests/sec"
}
]
},
{
"id": 3,
"title": "Peak Request Rate (5m)",
"type": "stat",
"gridPos": {"x": 18, "y": 0, "w": 6, "h": 4},
"targets": [
{
"expr": "max(sum(rate(http_requests_total[5m])))",
"legendFormat": "Peak requests/sec"
}
]
},
{
"id": 4,
"title": "Error Rates by Service",
"type": "graph",
"gridPos": {"x": 0, "y": 8, "w": 12, "h": 8},
"targets": [
{
"expr": "sum by (service) (rate(http_requests_total{status_code=~\"5..\"}[5m]))",
"legendFormat": "{{service}}"
}
]
},
{
"id": 5,
"title": "Overall Error Rate",
"type": "stat",
"gridPos": {"x": 12, "y": 8, "w": 6, "h": 4},
"targets": [
{
"expr": "100 * (sum(rate(http_requests_total{status_code=~\"5..\"}[5m])) / sum(rate(http_requests_total[5m])))",
"legendFormat": "Error %"
}
]
},
{
"id": 6,
"title": "4xx Error Rate",
"type": "stat",
"gridPos": {"x": 18, "y": 8, "w": 6, "h": 4},
"targets": [
{
"expr": "100 * (sum(rate(http_requests_total{status_code=~\"4..\"}[5m])) / sum(rate(http_requests_total[5m])))",
"legendFormat": "4xx %"
}
]
},
{
"id": 7,
"title": "P95 Latency by Service (ms)",
"type": "graph",
"gridPos": {"x": 0, "y": 16, "w": 12, "h": 8},
"targets": [
{
"expr": "histogram_quantile(0.95, sum by (service, le) (rate(http_request_duration_seconds_bucket[5m]))) * 1000",
"legendFormat": "{{service}} p95"
}
]
},
{
"id": 8,
"title": "P99 Latency by Service (ms)",
"type": "graph",
"gridPos": {"x": 12, "y": 16, "w": 12, "h": 8},
"targets": [
{
"expr": "histogram_quantile(0.99, sum by (service, le) (rate(http_request_duration_seconds_bucket[5m]))) * 1000",
"legendFormat": "{{service}} p99"
}
]
},
{
"id": 9,
"title": "Average Latency (ms)",
"type": "stat",
"gridPos": {"x": 0, "y": 24, "w": 6, "h": 4},
"targets": [
{
"expr": "(sum(rate(http_request_duration_seconds_sum[5m])) / sum(rate(http_request_duration_seconds_count[5m]))) * 1000",
"legendFormat": "Avg latency ms"
}
]
},
{
"id": 10,
"title": "Active Tenants",
"type": "stat",
"gridPos": {"x": 6, "y": 24, "w": 6, "h": 4},
"targets": [
{
"expr": "count(count by (tenant_id) (rate(http_requests_total[5m])))",
"legendFormat": "Active tenants"
}
]
},
{
"id": 11,
"title": "Requests per Tenant",
"type": "stat",
"gridPos": {"x": 12, "y": 24, "w": 12, "h": 4},
"targets": [
{
"expr": "sum by (tenant_id) (rate(http_requests_total[5m]))",
"legendFormat": "Tenant {{tenant_id}}"
}
]
},
{
"id": 12,
"title": "Alert Generation Rate (per minute)",
"type": "graph",
"gridPos": {"x": 0, "y": 32, "w": 12, "h": 8},
"targets": [
{
"expr": "rate(ALERTS_FOR_STATE[1m])",
"legendFormat": "{{alertname}}"
}
]
},
{
"id": 13,
"title": "Training Job Success Rate",
"type": "stat",
"gridPos": {"x": 12, "y": 32, "w": 12, "h": 8},
"targets": [
{
"expr": "100 * (sum(training_job_completed_total{status=\"success\"}) / sum(training_job_completed_total))",
"legendFormat": "Success rate %"
}
]
},
{
"id": 14,
"title": "Training Jobs in Progress",
"type": "stat",
"gridPos": {"x": 0, "y": 40, "w": 6, "h": 4},
"targets": [
{
"expr": "count(training_job_in_progress)",
"legendFormat": "Jobs running"
}
]
},
{
"id": 15,
"title": "Training Job Completion Time (p95, minutes)",
"type": "stat",
"gridPos": {"x": 6, "y": 40, "w": 6, "h": 4},
"targets": [
{
"expr": "histogram_quantile(0.95, training_job_duration_seconds) / 60",
"legendFormat": "p95 minutes"
}
]
},
{
"id": 16,
"title": "Failed Training Jobs",
"type": "stat",
"gridPos": {"x": 12, "y": 40, "w": 6, "h": 4},
"targets": [
{
"expr": "sum(training_job_completed_total{status=\"failed\"})",
"legendFormat": "Failed jobs"
}
]
},
{
"id": 17,
"title": "Total Training Jobs Completed",
"type": "stat",
"gridPos": {"x": 18, "y": 40, "w": 6, "h": 4},
"targets": [
{
"expr": "sum(training_job_completed_total)",
"legendFormat": "Total completed"
}
]
},
{
"id": 18,
"title": "API Health Status",
"type": "table",
"gridPos": {"x": 0, "y": 48, "w": 12, "h": 8},
"targets": [
{
"expr": "up{job=\"bakery-services\"}",
"format": "table",
"instant": true
}
],
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": {},
"indexByName": {},
"renameByName": {
"service": "Service",
"Value": "Status",
"instance": "Instance"
}
}
}
]
},
{
"id": 19,
"title": "Service Success Rate (%)",
"type": "graph",
"gridPos": {"x": 12, "y": 48, "w": 12, "h": 8},
"targets": [
{
"expr": "100 * (1 - (sum by (service) (rate(http_requests_total{status_code=~\"5..\"}[5m])) / sum by (service) (rate(http_requests_total[5m]))))",
"legendFormat": "{{service}}"
}
]
},
{
"id": 20,
"title": "Requests Processed Today",
"type": "stat",
"gridPos": {"x": 0, "y": 56, "w": 12, "h": 4},
"targets": [
{
"expr": "sum(increase(http_requests_total[24h]))",
"legendFormat": "Requests (24h)"
}
]
},
{
"id": 21,
"title": "Distinct Users Today",
"type": "stat",
"gridPos": {"x": 12, "y": 56, "w": 12, "h": 4},
"targets": [
{
"expr": "count(count by (user_id) (increase(http_requests_total{user_id!=\"\"}[24h])))",
"legendFormat": "Users (24h)"
}
]
}
]
}
}

View File

@@ -1,177 +0,0 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-dashboards
namespace: monitoring
data:
gateway-metrics.json: |
{
"dashboard": {
"title": "Bakery IA - Gateway Metrics",
"tags": ["bakery-ia", "gateway"],
"timezone": "browser",
"panels": [
{
"id": 1,
"title": "Request Rate by Endpoint",
"type": "graph",
"gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
"targets": [{
"expr": "rate(http_requests_total{service=\"gateway\"}[5m])",
"legendFormat": "{{method}} {{endpoint}}"
}]
},
{
"id": 2,
"title": "P95 Request Latency",
"type": "graph",
"gridPos": {"x": 12, "y": 0, "w": 12, "h": 8},
"targets": [{
"expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{service=\"gateway\"}[5m]))",
"legendFormat": "{{endpoint}} p95"
}]
},
{
"id": 3,
"title": "Error Rate (5xx)",
"type": "graph",
"gridPos": {"x": 0, "y": 8, "w": 12, "h": 8},
"targets": [{
"expr": "rate(http_requests_total{service=\"gateway\",status_code=~\"5..\"}[5m])",
"legendFormat": "{{endpoint}} errors"
}]
},
{
"id": 4,
"title": "Active Requests",
"type": "stat",
"gridPos": {"x": 12, "y": 8, "w": 6, "h": 4},
"targets": [{
"expr": "sum(rate(http_requests_total{service=\"gateway\"}[1m]))"
}]
},
{
"id": 5,
"title": "Authentication Success Rate",
"type": "stat",
"gridPos": {"x": 18, "y": 8, "w": 6, "h": 4},
"targets": [{
"expr": "rate(gateway_auth_responses_total[5m]) / rate(gateway_auth_requests_total[5m]) * 100"
}]
}
],
"refresh": "10s",
"schemaVersion": 16,
"version": 1
}
}
services-overview.json: |
{
"dashboard": {
"title": "Bakery IA - Services Overview",
"tags": ["bakery-ia", "services"],
"timezone": "browser",
"panels": [
{
"id": 1,
"title": "Request Rate by Service",
"type": "graph",
"gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
"targets": [{
"expr": "sum by (service) (rate(http_requests_total[5m]))",
"legendFormat": "{{service}}"
}]
},
{
"id": 2,
"title": "P99 Latency by Service",
"type": "graph",
"gridPos": {"x": 12, "y": 0, "w": 12, "h": 8},
"targets": [{
"expr": "histogram_quantile(0.99, sum by (service, le) (rate(http_request_duration_seconds_bucket[5m])))",
"legendFormat": "{{service}} p99"
}]
},
{
"id": 3,
"title": "Error Rate by Service",
"type": "graph",
"gridPos": {"x": 0, "y": 8, "w": 24, "h": 8},
"targets": [{
"expr": "sum by (service) (rate(http_requests_total{status_code=~\"5..\"}[5m]))",
"legendFormat": "{{service}}"
}]
},
{
"id": 4,
"title": "Service Health Status",
"type": "table",
"gridPos": {"x": 0, "y": 16, "w": 24, "h": 8},
"targets": [{
"expr": "up{job=\"bakery-services\"}",
"format": "table",
"instant": true
}],
"transformations": [{
"id": "organize",
"options": {
"excludeByName": {},
"indexByName": {},
"renameByName": {
"service": "Service Name",
"Value": "Status"
}
}
}]
}
],
"refresh": "30s",
"schemaVersion": 16,
"version": 1
}
}
circuit-breakers.json: |
{
"dashboard": {
"title": "Bakery IA - Circuit Breakers",
"tags": ["bakery-ia", "reliability"],
"timezone": "browser",
"panels": [
{
"id": 1,
"title": "Circuit Breaker States",
"type": "stat",
"gridPos": {"x": 0, "y": 0, "w": 24, "h": 4},
"targets": [{
"expr": "circuit_breaker_state",
"legendFormat": "{{service}} - {{state}}"
}]
},
{
"id": 2,
"title": "Circuit Breaker Trips",
"type": "graph",
"gridPos": {"x": 0, "y": 4, "w": 12, "h": 8},
"targets": [{
"expr": "rate(circuit_breaker_opened_total[5m])",
"legendFormat": "{{service}}"
}]
},
{
"id": 3,
"title": "Rejected Requests",
"type": "graph",
"gridPos": {"x": 12, "y": 4, "w": 12, "h": 8},
"targets": [{
"expr": "rate(circuit_breaker_rejected_total[5m])",
"legendFormat": "{{service}}"
}]
}
],
"refresh": "10s",
"schemaVersion": 16,
"version": 1
}
}

View File

@@ -1,166 +0,0 @@
---
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-datasources
namespace: monitoring
data:
prometheus.yaml: |
apiVersion: 1
datasources:
- name: Prometheus
type: prometheus
access: proxy
url: http://prometheus:9090
isDefault: true
editable: false
---
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-dashboards-config
namespace: monitoring
data:
dashboards.yaml: |
apiVersion: 1
providers:
- name: 'default'
orgId: 1
folder: 'Bakery IA'
type: file
disableDeletion: false
updateIntervalSeconds: 10
allowUiUpdates: true
options:
path: /var/lib/grafana/dashboards
- name: 'extended'
orgId: 1
folder: 'Bakery IA - Extended'
type: file
disableDeletion: false
updateIntervalSeconds: 10
allowUiUpdates: true
options:
path: /var/lib/grafana/dashboards-extended
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: grafana
namespace: monitoring
labels:
app: grafana
spec:
replicas: 1
selector:
matchLabels:
app: grafana
template:
metadata:
labels:
app: grafana
spec:
containers:
- name: grafana
image: grafana/grafana:12.3.0
ports:
- containerPort: 3000
name: http
env:
- name: GF_SECURITY_ADMIN_USER
valueFrom:
secretKeyRef:
name: grafana-admin
key: admin-user
- name: GF_SECURITY_ADMIN_PASSWORD
valueFrom:
secretKeyRef:
name: grafana-admin
key: admin-password
- name: GF_SERVER_ROOT_URL
value: "http://monitoring.bakery-ia.local/grafana"
- name: GF_SERVER_SERVE_FROM_SUB_PATH
value: "true"
- name: GF_AUTH_ANONYMOUS_ENABLED
value: "false"
- name: GF_INSTALL_PLUGINS
value: ""
volumeMounts:
- name: grafana-storage
mountPath: /var/lib/grafana
- name: grafana-datasources
mountPath: /etc/grafana/provisioning/datasources
- name: grafana-dashboards-config
mountPath: /etc/grafana/provisioning/dashboards
- name: grafana-dashboards
mountPath: /var/lib/grafana/dashboards
- name: grafana-dashboards-extended
mountPath: /var/lib/grafana/dashboards-extended
resources:
requests:
memory: "256Mi"
cpu: "100m"
limits:
memory: "512Mi"
cpu: "500m"
livenessProbe:
httpGet:
path: /api/health
port: 3000
initialDelaySeconds: 30
periodSeconds: 10
readinessProbe:
httpGet:
path: /api/health
port: 3000
initialDelaySeconds: 5
periodSeconds: 5
volumes:
- name: grafana-storage
persistentVolumeClaim:
claimName: grafana-storage
- name: grafana-datasources
configMap:
name: grafana-datasources
- name: grafana-dashboards-config
configMap:
name: grafana-dashboards-config
- name: grafana-dashboards
configMap:
name: grafana-dashboards
- name: grafana-dashboards-extended
configMap:
name: grafana-dashboards-extended
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: grafana-storage
namespace: monitoring
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 5Gi
---
apiVersion: v1
kind: Service
metadata:
name: grafana
namespace: monitoring
labels:
app: grafana
spec:
type: ClusterIP
ports:
- port: 3000
targetPort: 3000
protocol: TCP
name: http
selector:
app: grafana

View File

@@ -1,100 +0,0 @@
---
# PodDisruptionBudgets ensure minimum availability during voluntary disruptions
# (node drains, rolling updates, etc.)
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
name: prometheus-pdb
namespace: monitoring
spec:
minAvailable: 1
selector:
matchLabels:
app: prometheus
---
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
name: alertmanager-pdb
namespace: monitoring
spec:
minAvailable: 2
selector:
matchLabels:
app: alertmanager
---
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
name: grafana-pdb
namespace: monitoring
spec:
minAvailable: 1
selector:
matchLabels:
app: grafana
---
# ResourceQuota limits total resources in monitoring namespace
apiVersion: v1
kind: ResourceQuota
metadata:
name: monitoring-quota
namespace: monitoring
spec:
hard:
# Compute resources
requests.cpu: "10"
requests.memory: "16Gi"
limits.cpu: "20"
limits.memory: "32Gi"
# Storage
persistentvolumeclaims: "10"
requests.storage: "100Gi"
# Object counts
pods: "50"
services: "20"
configmaps: "30"
secrets: "20"
---
# LimitRange sets default resource limits for pods in monitoring namespace
apiVersion: v1
kind: LimitRange
metadata:
name: monitoring-limits
namespace: monitoring
spec:
limits:
# Default container limits
- max:
cpu: "2"
memory: "4Gi"
min:
cpu: "10m"
memory: "16Mi"
default:
cpu: "500m"
memory: "512Mi"
defaultRequest:
cpu: "100m"
memory: "128Mi"
type: Container
# Pod limits
- max:
cpu: "4"
memory: "8Gi"
type: Pod
# PVC limits
- max:
storage: "50Gi"
min:
storage: "1Gi"
type: PersistentVolumeClaim

View File

@@ -1,42 +0,0 @@
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: monitoring-ingress
namespace: monitoring
annotations:
nginx.ingress.kubernetes.io/rewrite-target: /$2
nginx.ingress.kubernetes.io/ssl-redirect: "false"
spec:
rules:
- host: monitoring.bakery-ia.local
http:
paths:
- path: /grafana(/|$)(.*)
pathType: ImplementationSpecific
backend:
service:
name: grafana
port:
number: 3000
- path: /prometheus(/|$)(.*)
pathType: ImplementationSpecific
backend:
service:
name: prometheus-external
port:
number: 9090
- path: /jaeger(/|$)(.*)
pathType: ImplementationSpecific
backend:
service:
name: jaeger-query
port:
number: 16686
- path: /alertmanager(/|$)(.*)
pathType: ImplementationSpecific
backend:
service:
name: alertmanager-external
port:
number: 9093

View File

@@ -1,190 +0,0 @@
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: jaeger
namespace: monitoring
labels:
app: jaeger
spec:
replicas: 1
selector:
matchLabels:
app: jaeger
template:
metadata:
labels:
app: jaeger
spec:
containers:
- name: jaeger
image: jaegertracing/all-in-one:1.51
env:
- name: COLLECTOR_ZIPKIN_HOST_PORT
value: ":9411"
- name: COLLECTOR_OTLP_ENABLED
value: "true"
- name: SPAN_STORAGE_TYPE
value: "badger"
- name: BADGER_EPHEMERAL
value: "false"
- name: BADGER_DIRECTORY_VALUE
value: "/badger/data"
- name: BADGER_DIRECTORY_KEY
value: "/badger/key"
ports:
- containerPort: 5775
protocol: UDP
name: zipkin-compact
- containerPort: 6831
protocol: UDP
name: jaeger-compact
- containerPort: 6832
protocol: UDP
name: jaeger-binary
- containerPort: 5778
protocol: TCP
name: config-rest
- containerPort: 16686
protocol: TCP
name: query
- containerPort: 14250
protocol: TCP
name: grpc
- containerPort: 14268
protocol: TCP
name: c-tchan-trft
- containerPort: 14269
protocol: TCP
name: admin-http
- containerPort: 9411
protocol: TCP
name: zipkin
- containerPort: 4317
protocol: TCP
name: otlp-grpc
- containerPort: 4318
protocol: TCP
name: otlp-http
volumeMounts:
- name: jaeger-storage
mountPath: /badger
resources:
requests:
memory: "512Mi"
cpu: "250m"
limits:
memory: "1Gi"
cpu: "500m"
livenessProbe:
httpGet:
path: /
port: 14269
initialDelaySeconds: 30
periodSeconds: 10
readinessProbe:
httpGet:
path: /
port: 14269
initialDelaySeconds: 5
periodSeconds: 5
volumes:
- name: jaeger-storage
persistentVolumeClaim:
claimName: jaeger-storage
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: jaeger-storage
namespace: monitoring
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 10Gi
---
apiVersion: v1
kind: Service
metadata:
name: jaeger-query
namespace: monitoring
labels:
app: jaeger
spec:
type: ClusterIP
ports:
- port: 16686
targetPort: 16686
protocol: TCP
name: query
selector:
app: jaeger
---
apiVersion: v1
kind: Service
metadata:
name: jaeger-collector
namespace: monitoring
labels:
app: jaeger
spec:
type: ClusterIP
ports:
- port: 14268
targetPort: 14268
protocol: TCP
name: c-tchan-trft
- port: 14250
targetPort: 14250
protocol: TCP
name: grpc
- port: 9411
targetPort: 9411
protocol: TCP
name: zipkin
- port: 4317
targetPort: 4317
protocol: TCP
name: otlp-grpc
- port: 4318
targetPort: 4318
protocol: TCP
name: otlp-http
selector:
app: jaeger
---
apiVersion: v1
kind: Service
metadata:
name: jaeger-agent
namespace: monitoring
labels:
app: jaeger
spec:
type: ClusterIP
clusterIP: None
ports:
- port: 5775
targetPort: 5775
protocol: UDP
name: zipkin-compact
- port: 6831
targetPort: 6831
protocol: UDP
name: jaeger-compact
- port: 6832
targetPort: 6832
protocol: UDP
name: jaeger-binary
- port: 5778
targetPort: 5778
protocol: TCP
name: config-rest
selector:
app: jaeger

View File

@@ -1,18 +1,20 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
# Minimal Monitoring Infrastructure
# SigNoz is now managed via Helm in the 'signoz' namespace
# This kustomization only maintains:
# - Namespace for legacy resources (if needed)
# - Node exporter for infrastructure metrics
# - PostgreSQL exporter for database metrics
# - Optional OTEL collector (can be disabled if using SigNoz's built-in collector)
resources:
- namespace.yaml
- secrets.yaml
- prometheus.yaml
- alert-rules.yaml
- alertmanager.yaml
- alertmanager-init.yaml
- grafana.yaml
- grafana-dashboards.yaml
- grafana-dashboards-extended.yaml
- postgres-exporter.yaml
# Exporters for metrics collection
- node-exporter.yaml
- jaeger.yaml
- ha-policies.yaml
- ingress.yaml
- postgres-exporter.yaml
# Optional: Keep OTEL collector or use SigNoz's built-in one
# Uncomment if you want a dedicated OTEL collector in monitoring namespace
# - otel-collector.yaml

View File

@@ -0,0 +1,167 @@
---
apiVersion: v1
kind: ConfigMap
metadata:
name: otel-collector-config
namespace: monitoring
data:
otel-collector-config.yaml: |
extensions:
health_check:
endpoint: 0.0.0.0:13133
receivers:
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
http:
endpoint: 0.0.0.0:4318
processors:
batch:
timeout: 10s
send_batch_size: 1024
# Memory limiter to prevent OOM
memory_limiter:
check_interval: 1s
limit_mib: 512
spike_limit_mib: 128
exporters:
# Export metrics to Prometheus
prometheus:
endpoint: "0.0.0.0:8889"
namespace: otelcol
const_labels:
source: otel-collector
# Export to SigNoz
otlp/signoz:
endpoint: "signoz-query-service.monitoring.svc.cluster.local:8080"
tls:
insecure: true
# Logging exporter for debugging traces and logs
logging:
loglevel: info
sampling_initial: 5
sampling_thereafter: 200
service:
extensions: [health_check]
pipelines:
# Traces pipeline: receive -> process -> export to SigNoz
traces:
receivers: [otlp]
processors: [memory_limiter, batch]
exporters: [otlp/signoz, logging]
# Metrics pipeline: receive -> process -> export to both Prometheus and SigNoz
metrics:
receivers: [otlp]
processors: [memory_limiter, batch]
exporters: [prometheus, otlp/signoz]
# Logs pipeline: receive -> process -> export to SigNoz
logs:
receivers: [otlp]
processors: [memory_limiter, batch]
exporters: [otlp/signoz, logging]
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: otel-collector
namespace: monitoring
labels:
app: otel-collector
spec:
replicas: 1
selector:
matchLabels:
app: otel-collector
template:
metadata:
labels:
app: otel-collector
spec:
containers:
- name: otel-collector
image: otel/opentelemetry-collector-contrib:0.91.0
args:
- --config=/conf/otel-collector-config.yaml
ports:
- containerPort: 4317
protocol: TCP
name: otlp-grpc
- containerPort: 4318
protocol: TCP
name: otlp-http
- containerPort: 8889
protocol: TCP
name: prometheus
- containerPort: 13133
protocol: TCP
name: health-check
volumeMounts:
- name: otel-collector-config
mountPath: /conf
resources:
requests:
memory: "256Mi"
cpu: "100m"
limits:
memory: "512Mi"
cpu: "500m"
livenessProbe:
httpGet:
path: /
port: 13133
initialDelaySeconds: 30
periodSeconds: 10
readinessProbe:
httpGet:
path: /
port: 13133
initialDelaySeconds: 5
periodSeconds: 5
volumes:
- name: otel-collector-config
configMap:
name: otel-collector-config
items:
- key: otel-collector-config.yaml
path: otel-collector-config.yaml
---
apiVersion: v1
kind: Service
metadata:
name: otel-collector
namespace: monitoring
labels:
app: otel-collector
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "8889"
prometheus.io/path: "/metrics"
spec:
type: ClusterIP
ports:
- port: 4317
targetPort: 4317
protocol: TCP
name: otlp-grpc
- port: 4318
targetPort: 4318
protocol: TCP
name: otlp-http
- port: 8889
targetPort: 8889
protocol: TCP
name: prometheus
selector:
app: otel-collector

View File

@@ -1,278 +0,0 @@
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: prometheus
namespace: monitoring
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: prometheus
rules:
- apiGroups: [""]
resources:
- nodes
- nodes/proxy
- services
- endpoints
- pods
verbs: ["get", "list", "watch"]
- apiGroups:
- extensions
resources:
- ingresses
verbs: ["get", "list", "watch"]
- nonResourceURLs: ["/metrics"]
verbs: ["get"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: prometheus
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: prometheus
subjects:
- kind: ServiceAccount
name: prometheus
namespace: monitoring
---
apiVersion: v1
kind: ConfigMap
metadata:
name: prometheus-config
namespace: monitoring
data:
prometheus.yml: |
global:
scrape_interval: 30s
evaluation_interval: 30s
external_labels:
cluster: 'bakery-ia'
environment: 'production'
# AlertManager configuration
alerting:
alertmanagers:
- static_configs:
- targets:
- alertmanager-0.alertmanager.monitoring.svc.cluster.local:9093
- alertmanager-1.alertmanager.monitoring.svc.cluster.local:9093
- alertmanager-2.alertmanager.monitoring.svc.cluster.local:9093
# Load alert rules
rule_files:
- '/etc/prometheus/rules/*.yml'
scrape_configs:
# Scrape Prometheus itself
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090']
# Scrape all bakery-ia services
- job_name: 'bakery-services'
kubernetes_sd_configs:
- role: pod
namespaces:
names:
- bakery-ia
relabel_configs:
# Only scrape pods with metrics port
- source_labels: [__meta_kubernetes_pod_container_port_name]
action: keep
regex: http
# Add service name label
- source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
target_label: service
# Add component label
- source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component]
target_label: component
# Add pod name
- source_labels: [__meta_kubernetes_pod_name]
target_label: pod
# Set metrics path
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
action: replace
target_label: __metrics_path__
regex: (.+)
# Set scrape port
- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
action: replace
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
target_label: __address__
# Scrape Kubernetes nodes
- job_name: 'kubernetes-nodes'
kubernetes_sd_configs:
- role: node
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: kubernetes.default.svc:443
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics
# Scrape AlertManager
- job_name: 'alertmanager'
static_configs:
- targets:
- alertmanager-0.alertmanager.monitoring.svc.cluster.local:9093
- alertmanager-1.alertmanager.monitoring.svc.cluster.local:9093
- alertmanager-2.alertmanager.monitoring.svc.cluster.local:9093
# Scrape PostgreSQL exporter
- job_name: 'postgres-exporter'
static_configs:
- targets: ['postgres-exporter.monitoring.svc.cluster.local:9187']
# Scrape Node Exporter
- job_name: 'node-exporter'
kubernetes_sd_configs:
- role: node
relabel_configs:
- source_labels: [__address__]
regex: '(.*):10250'
replacement: '${1}:9100'
target_label: __address__
- source_labels: [__meta_kubernetes_node_name]
target_label: node
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: prometheus
namespace: monitoring
labels:
app: prometheus
spec:
serviceName: prometheus
replicas: 2
selector:
matchLabels:
app: prometheus
template:
metadata:
labels:
app: prometheus
spec:
serviceAccountName: prometheus
affinity:
podAntiAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
podAffinityTerm:
labelSelector:
matchExpressions:
- key: app
operator: In
values:
- prometheus
topologyKey: kubernetes.io/hostname
containers:
- name: prometheus
image: prom/prometheus:v3.0.1
args:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--storage.tsdb.retention.time=30d'
- '--web.console.libraries=/usr/share/prometheus/console_libraries'
- '--web.console.templates=/usr/share/prometheus/consoles'
- '--web.enable-lifecycle'
ports:
- containerPort: 9090
name: web
volumeMounts:
- name: prometheus-config
mountPath: /etc/prometheus
- name: prometheus-rules
mountPath: /etc/prometheus/rules
- name: prometheus-storage
mountPath: /prometheus
resources:
requests:
memory: "1Gi"
cpu: "500m"
limits:
memory: "2Gi"
cpu: "1"
livenessProbe:
httpGet:
path: /-/healthy
port: 9090
initialDelaySeconds: 30
periodSeconds: 10
readinessProbe:
httpGet:
path: /-/ready
port: 9090
initialDelaySeconds: 5
periodSeconds: 5
volumes:
- name: prometheus-config
configMap:
name: prometheus-config
- name: prometheus-rules
configMap:
name: prometheus-alert-rules
volumeClaimTemplates:
- metadata:
name: prometheus-storage
spec:
accessModes: [ "ReadWriteOnce" ]
resources:
requests:
storage: 20Gi
---
apiVersion: v1
kind: Service
metadata:
name: prometheus
namespace: monitoring
labels:
app: prometheus
spec:
type: ClusterIP
clusterIP: None
ports:
- port: 9090
targetPort: 9090
protocol: TCP
name: web
selector:
app: prometheus
---
apiVersion: v1
kind: Service
metadata:
name: prometheus-external
namespace: monitoring
labels:
app: prometheus
spec:
type: ClusterIP
ports:
- port: 9090
targetPort: 9090
protocol: TCP
name: web
selector:
app: prometheus

View File

@@ -14,9 +14,10 @@ data:
DEBUG: "false"
LOG_LEVEL: "INFO"
# Observability Settings
# Set to "true" when Jaeger/monitoring stack is deployed
ENABLE_TRACING: "false"
# Observability Settings - SigNoz enabled
ENABLE_TRACING: "true"
ENABLE_METRICS: "true"
ENABLE_LOGS: "true"
# Database initialization settings
# IMPORTANT: Services NEVER run migrations - they only verify DB is ready
@@ -286,12 +287,11 @@ data:
LOG_FILE_PATH: "/app/logs"
LOG_ROTATION_SIZE: "100MB"
LOG_RETENTION_DAYS: "30"
PROMETHEUS_ENABLED: "true"
PROMETHEUS_RETENTION: "200h"
HEALTH_CHECK_TIMEOUT: "30"
HEALTH_CHECK_INTERVAL: "30"
PROMETHEUS_RETENTION_DAYS: "30"
GRAFANA_ROOT_URL: "http://monitoring.bakery-ia.local/grafana"
# Monitoring Configuration - SigNoz
SIGNOZ_ROOT_URL: "http://localhost/signoz"
# ================================================================
# DATA COLLECTION SETTINGS
@@ -382,16 +382,20 @@ data:
NOMINATIM_CPU_LIMIT: "4"
# ================================================================
# DISTRIBUTED TRACING (Jaeger/OpenTelemetry)
# OBSERVABILITY - SigNoz (Unified Monitoring)
# ================================================================
JAEGER_COLLECTOR_ENDPOINT: "http://jaeger-collector.monitoring:4317"
JAEGER_AGENT_HOST: "jaeger-agent.monitoring"
JAEGER_AGENT_PORT: "6831"
OTEL_EXPORTER_OTLP_ENDPOINT: "http://jaeger-collector.monitoring:4317"
# OpenTelemetry Configuration - Direct to SigNoz
OTEL_EXPORTER_OTLP_ENDPOINT: "http://signoz-otel-collector.signoz.svc.cluster.local:4317"
OTEL_EXPORTER_OTLP_PROTOCOL: "grpc"
OTEL_SERVICE_NAME: "bakery-ia"
OTEL_RESOURCE_ATTRIBUTES: "deployment.environment=development"
# SigNoz Endpoints
SIGNOZ_ENDPOINT: "http://signoz-query-service.signoz.svc.cluster.local:8080"
SIGNOZ_FRONTEND_URL: "http://signoz-frontend.signoz.svc.cluster.local:3301"
# ================================================================
# REPLENISHMENT PLANNING SETTINGS
# REPLENISHMENT PLANNING SETTINGS
# ================================================================
REPLENISHMENT_PROJECTION_HORIZON_DAYS: "7"
REPLENISHMENT_SERVICE_LEVEL: "0.95"

View File

@@ -9,11 +9,14 @@ metadata:
resources:
- ../../base
# Monitoring disabled for dev to save resources
# - ../../base/components/monitoring
# Monitoring enabled for dev environment
- ../../base/components/monitoring
- dev-ingress.yaml
# SigNoz ingress is applied by Tilt (see Tiltfile)
# - signoz-ingress.yaml
# Dev-Prod Parity: Enable HTTPS with self-signed certificates
- dev-certificate.yaml
- monitoring-certificate.yaml
- cluster-issuer-staging.yaml
# Exclude nominatim from dev to save resources
@@ -608,6 +611,39 @@ patches:
limits:
memory: "512Mi"
cpu: "300m"
# Optional exporters resource patches for dev
- target:
group: apps
version: v1
kind: DaemonSet
name: node-exporter
namespace: monitoring
patch: |-
- op: replace
path: /spec/template/spec/containers/0/resources
value:
requests:
memory: "32Mi"
cpu: "25m"
limits:
memory: "64Mi"
cpu: "100m"
- target:
group: apps
version: v1
kind: Deployment
name: postgres-exporter
namespace: monitoring
patch: |-
- op: replace
path: /spec/template/spec/containers/0/resources
value:
requests:
memory: "32Mi"
cpu: "25m"
limits:
memory: "64Mi"
cpu: "100m"
secretGenerator:
- name: dev-secrets

View File

@@ -0,0 +1,49 @@
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
name: bakery-dev-monitoring-tls-cert
namespace: monitoring
spec:
# Self-signed certificate for local development
secretName: bakery-ia-tls-cert
# Certificate duration
duration: 2160h # 90 days
renewBefore: 360h # 15 days
# Subject configuration
subject:
organizations:
- Bakery IA Development
# Common name
commonName: localhost
# DNS names this certificate is valid for
dnsNames:
- localhost
- monitoring.bakery-ia.local
# IP addresses (for localhost)
ipAddresses:
- 127.0.0.1
- ::1
# Use self-signed issuer for development
issuerRef:
name: selfsigned-issuer
kind: ClusterIssuer
group: cert-manager.io
# Private key configuration
privateKey:
algorithm: RSA
encoding: PKCS1
size: 2048
# Usages
usages:
- server auth
- client auth
- digital signature
- key encipherment

View File

@@ -0,0 +1,39 @@
---
# SigNoz Ingress for Development (localhost)
# SigNoz is deployed via Helm in the 'signoz' namespace
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: signoz-ingress-localhost
namespace: signoz
annotations:
nginx.ingress.kubernetes.io/ssl-redirect: "true"
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
nginx.ingress.kubernetes.io/rewrite-target: /$2
nginx.ingress.kubernetes.io/use-regex: "true"
spec:
ingressClassName: nginx
tls:
- hosts:
- localhost
secretName: bakery-ia-tls-cert
rules:
- host: localhost
http:
paths:
# SigNoz Frontend UI
- path: /signoz(/|$)(.*)
pathType: ImplementationSpecific
backend:
service:
name: signoz-frontend
port:
number: 3301
# SigNoz Query Service API
- path: /signoz-api(/|$)(.*)
pathType: ImplementationSpecific
backend:
service:
name: signoz-query-service
port:
number: 8080

View File

@@ -14,6 +14,7 @@ resources:
patchesStrategicMerge:
- storage-patch.yaml
- monitoring-ingress-patch.yaml
labels:
- includeSelectors: true
@@ -21,6 +22,89 @@ labels:
environment: production
tier: production
# SigNoz resource patches for production
patches:
# SigNoz ClickHouse production configuration
- target:
group: apps
version: v1
kind: StatefulSet
name: signoz-clickhouse
namespace: signoz
patch: |-
- op: replace
path: /spec/replicas
value: 2
- op: replace
path: /spec/template/spec/containers/0/resources
value:
requests:
memory: "2Gi"
cpu: "500m"
limits:
memory: "4Gi"
cpu: "1000m"
# SigNoz Query Service production configuration
- target:
group: apps
version: v1
kind: Deployment
name: signoz-query-service
namespace: signoz
patch: |-
- op: replace
path: /spec/replicas
value: 2
- op: replace
path: /spec/template/spec/containers/0/resources
value:
requests:
memory: "1Gi"
cpu: "500m"
limits:
memory: "2Gi"
cpu: "1000m"
# SigNoz AlertManager production configuration
- target:
group: apps
version: v1
kind: Deployment
name: signoz-alertmanager
namespace: signoz
patch: |-
- op: replace
path: /spec/replicas
value: 2
- op: replace
path: /spec/template/spec/containers/0/resources
value:
requests:
memory: "512Mi"
cpu: "250m"
limits:
memory: "1Gi"
cpu: "500m"
# SigNoz Frontend production configuration
- target:
group: apps
version: v1
kind: Deployment
name: signoz-frontend
namespace: signoz
patch: |-
- op: replace
path: /spec/replicas
value: 2
- op: replace
path: /spec/template/spec/containers/0/resources
value:
requests:
memory: "512Mi"
cpu: "250m"
limits:
memory: "1Gi"
cpu: "500m"
images:
- name: bakery/auth-service
newTag: latest

View File

@@ -17,14 +17,30 @@ data:
REQUEST_TIMEOUT: "30"
MAX_CONNECTIONS: "100"
# Monitoring
PROMETHEUS_ENABLED: "true"
# Monitoring - SigNoz (Unified Observability)
ENABLE_TRACING: "true"
ENABLE_METRICS: "true"
JAEGER_ENABLED: "true"
JAEGER_AGENT_HOST: "jaeger-agent.monitoring.svc.cluster.local"
JAEGER_AGENT_PORT: "6831"
ENABLE_LOGS: "true"
# OpenTelemetry Configuration - Direct to SigNoz
OTEL_EXPORTER_OTLP_ENDPOINT: "http://signoz-otel-collector.signoz.svc.cluster.local:4317"
OTEL_EXPORTER_OTLP_PROTOCOL: "grpc"
OTEL_SERVICE_NAME: "bakery-ia"
OTEL_RESOURCE_ATTRIBUTES: "deployment.environment=production,cluster.name=bakery-ia-prod"
# SigNoz Endpoints
SIGNOZ_ENDPOINT: "http://signoz-query-service.signoz.svc.cluster.local:8080"
SIGNOZ_FRONTEND_URL: "https://monitoring.bakewise.ai/signoz"
SIGNOZ_ROOT_URL: "https://monitoring.bakewise.ai/signoz"
# Rate Limiting (stricter in production)
RATE_LIMIT_ENABLED: "true"
RATE_LIMIT_PER_MINUTE: "60"
# CORS Configuration for Production
CORS_ORIGINS: "https://bakewise.ai"
CORS_ALLOW_CREDENTIALS: "true"
# Frontend Configuration
VITE_API_URL: "/api"
VITE_ENVIRONMENT: "production"

View File

@@ -16,7 +16,7 @@ metadata:
# CORS configuration for production
nginx.ingress.kubernetes.io/enable-cors: "true"
nginx.ingress.kubernetes.io/cors-allow-origin: "https://bakery.yourdomain.com,https://api.yourdomain.com"
nginx.ingress.kubernetes.io/cors-allow-origin: "https://bakewise.ai"
nginx.ingress.kubernetes.io/cors-allow-methods: "GET, POST, PUT, DELETE, OPTIONS, PATCH"
nginx.ingress.kubernetes.io/cors-allow-headers: "Content-Type, Authorization, X-Requested-With, Accept, Origin"
nginx.ingress.kubernetes.io/cors-allow-credentials: "true"
@@ -40,12 +40,10 @@ spec:
ingressClassName: nginx
tls:
- hosts:
- bakery.yourdomain.com
- api.yourdomain.com
- monitoring.yourdomain.com
- bakewise.ai
secretName: bakery-ia-prod-tls-cert
rules:
- host: bakery.yourdomain.com
- host: bakewise.ai
http:
paths:
- path: /
@@ -55,7 +53,7 @@ spec:
name: frontend-service
port:
number: 3000
- path: /api
- path: /api/v1
pathType: Prefix
backend:
service:
@@ -63,31 +61,4 @@ spec:
port:
number: 8000
- host: api.yourdomain.com
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: gateway-service
port:
number: 8000
- host: monitoring.yourdomain.com
http:
paths:
- path: /grafana
pathType: Prefix
backend:
service:
name: grafana-service
port:
number: 3000
- path: /prometheus
pathType: Prefix
backend:
service:
name: prometheus-service
port:
number: 9090
# Monitoring (monitoring.bakewise.ai) is now handled by signoz-ingress.yaml in the signoz namespace

View File

@@ -0,0 +1,78 @@
---
# SigNoz Ingress for Production
# SigNoz is deployed via Helm in the 'signoz' namespace
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: signoz-ingress-prod
namespace: signoz
labels:
app.kubernetes.io/name: signoz
app.kubernetes.io/component: ingress
annotations:
# Nginx ingress controller annotations
nginx.ingress.kubernetes.io/ssl-redirect: "true"
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
nginx.ingress.kubernetes.io/proxy-body-size: "50m"
nginx.ingress.kubernetes.io/proxy-connect-timeout: "600"
nginx.ingress.kubernetes.io/proxy-send-timeout: "600"
nginx.ingress.kubernetes.io/proxy-read-timeout: "600"
nginx.ingress.kubernetes.io/rewrite-target: /$2
nginx.ingress.kubernetes.io/use-regex: "true"
# CORS configuration
nginx.ingress.kubernetes.io/enable-cors: "true"
nginx.ingress.kubernetes.io/cors-allow-origin: "https://bakewise.ai,https://monitoring.bakewise.ai"
nginx.ingress.kubernetes.io/cors-allow-methods: "GET, POST, PUT, DELETE, OPTIONS, PATCH"
nginx.ingress.kubernetes.io/cors-allow-headers: "Content-Type, Authorization, X-Requested-With, Accept, Origin"
nginx.ingress.kubernetes.io/cors-allow-credentials: "true"
# Security headers
nginx.ingress.kubernetes.io/configuration-snippet: |
more_set_headers "X-Frame-Options: SAMEORIGIN";
more_set_headers "X-Content-Type-Options: nosniff";
more_set_headers "X-XSS-Protection: 1; mode=block";
more_set_headers "Referrer-Policy: strict-origin-when-cross-origin";
# Rate limiting
nginx.ingress.kubernetes.io/limit-rps: "100"
nginx.ingress.kubernetes.io/limit-connections: "50"
# Cert-manager annotations for automatic certificate issuance
cert-manager.io/cluster-issuer: "letsencrypt-production"
cert-manager.io/acme-challenge-type: http01
spec:
ingressClassName: nginx
tls:
- hosts:
- monitoring.bakewise.ai
secretName: signoz-prod-tls-cert
rules:
- host: monitoring.bakewise.ai
http:
paths:
# SigNoz Frontend UI
- path: /signoz(/|$)(.*)
pathType: ImplementationSpecific
backend:
service:
name: signoz-frontend
port:
number: 3301
# SigNoz Query Service API
- path: /signoz-api(/|$)(.*)
pathType: ImplementationSpecific
backend:
service:
name: signoz-query-service
port:
number: 8080
# SigNoz AlertManager
- path: /signoz-alerts(/|$)(.*)
pathType: ImplementationSpecific
backend:
service:
name: signoz-alertmanager
port:
number: 9093

View File

@@ -0,0 +1,79 @@
# SigNoz Helm Chart Values - Customized for Bakery IA
# https://github.com/SigNoz/charts
# Global settings
global:
storageClass: "standard"
# Frontend configuration
frontend:
service:
type: ClusterIP
port: 3301
ingress:
enabled: true
hosts:
- host: localhost
paths:
- path: /signoz
pathType: Prefix
annotations:
nginx.ingress.kubernetes.io/rewrite-target: /$2
# Query Service configuration
queryService:
replicaCount: 1
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 200m
memory: 512Mi
# AlertManager configuration
alertmanager:
replicaCount: 1
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
cpu: 100m
memory: 256Mi
# ClickHouse configuration
clickhouse:
persistence:
enabled: true
size: 10Gi
resources:
requests:
cpu: 500m
memory: 1Gi
limits:
cpu: 1000m
memory: 2Gi
# OpenTelemetry Collector configuration
otelCollector:
enabled: true
config:
exporters:
otlp:
endpoint: "signoz-query-service:8080"
service:
pipelines:
traces:
receivers: [otlp]
exporters: [otlp]
metrics:
receivers: [otlp]
exporters: [otlp]
logs:
receivers: [otlp]
exporters: [otlp]
# Resource optimization for development
# These can be increased for production
development: true

View File

@@ -228,6 +228,12 @@ setup() {
if [ $? -eq 0 ]; then
print_success "Colima started successfully"
# Increase inotify limits for Colima to prevent "too many open files" errors
print_status "Increasing inotify limits in Colima VM..."
colima ssh --profile k8s-local "sudo sysctl -w fs.inotify.max_user_watches=524288"
colima ssh --profile k8s-local "sudo sysctl -w fs.inotify.max_user_instances=512"
print_success "Inotify limits increased"
else
print_error "Failed to start Colima"
exit 1
@@ -262,7 +268,7 @@ setup() {
# 4. Connect registry to Kind network
connect_registry_to_kind
# 3. Install NGINX Ingress Controller
# 5. Install NGINX Ingress Controller
print_status "Installing NGINX Ingress Controller..."
# Apply the ingress-nginx manifest
@@ -286,29 +292,9 @@ setup() {
exit 1
fi
# 4. Configure permanent localhost access
print_status "Configuring localhost access via NodePort..."
print_success "NGINX Ingress Controller ready (using Kind's built-in NodePort configuration)"
# Check if service exists
if kubectl get svc ingress-nginx-controller -n ingress-nginx &>/dev/null; then
# Patch the service to expose NodePorts
kubectl patch svc ingress-nginx-controller \
-n ingress-nginx \
--type merge \
-p '{"spec":{"type":"NodePort","ports":[{"name":"http","port":80,"targetPort":"http","nodePort":30080},{"name":"https","port":443,"targetPort":"https","nodePort":30443}]}}'
if [ $? -eq 0 ]; then
print_success "NodePort configuration applied"
else
print_error "Failed to patch Ingress service"
exit 1
fi
else
print_error "Ingress NGINX controller service not found"
exit 1
fi
# 5. Verify port mappings from kind-config.yaml
# 6. Verify port mappings from kind-config.yaml
print_status "Verifying port mappings from configuration..."
# Extract ports from kind-config.yaml
@@ -323,24 +309,24 @@ setup() {
echo " - Colima profile: k8s-local"
echo " - Kind cluster: $CLUSTER_NAME"
echo " - Local registry: localhost:5001"
echo " - Direct port mappings (from kind-config.yaml):"
echo " Frontend: localhost:3000 -> container:30300"
echo " Gateway: localhost:8000 -> container:30800"
echo " - Ingress access:"
echo " HTTP: localhost:${HTTP_HOST_PORT} -> ingress:30080"
echo " HTTPS: localhost:${HTTPS_HOST_PORT} -> ingress:30443"
echo " - NodePort access:"
echo " HTTP: localhost:30080"
echo " HTTPS: localhost:30443"
echo "----------------------------------------"
print_status "To access your applications:"
echo " - Use Ingress via: http://localhost:${HTTP_HOST_PORT}"
echo " - Direct NodePort: http://localhost:30080"
echo ""
print_status "Port Mappings (configured in kind-config.yaml):"
echo " - HTTP Ingress: localhost:${HTTP_HOST_PORT} -> Kind NodePort 30080"
echo " - HTTPS Ingress: localhost:${HTTPS_HOST_PORT} -> Kind NodePort 30443"
echo " - Frontend Direct: localhost:3000 -> container:30300"
echo " - Gateway Direct: localhost:8000 -> container:30800"
echo ""
print_status "How to access your application:"
echo " 1. Start Tilt: tilt up"
echo " 2. Access via:"
echo " - Ingress: http://localhost (or https://localhost)"
echo " - Direct: http://localhost:3000 (frontend), http://localhost:8000 (gateway)"
echo " - Tilt UI: http://localhost:10350"
echo "----------------------------------------"
print_status "Local Registry Information:"
echo " - Registry URL: localhost:5001"
echo " - Images will be pushed to: localhost:5001/bakery/<service>"
echo " - Update your Tiltfile with: default_registry('localhost:5001')"
echo " - Images pushed to: localhost:5001/bakery/<service>"
echo " - Tiltfile already configured: default_registry('localhost:5001')"
echo "----------------------------------------"
}

View File

@@ -1,22 +1,50 @@
"""Main FastAPI application for AI Insights Service."""
from fastapi import FastAPI
from fastapi import FastAPI, Response
from fastapi.middleware.cors import CORSMiddleware
from contextlib import asynccontextmanager
import structlog
import os
from app.core.config import settings
from app.core.database import init_db, close_db
from app.api import insights
from shared.monitoring.logging import setup_logging
from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware
# Configure structured logging
structlog.configure(
processors=[
structlog.processors.TimeStamper(fmt="iso"),
structlog.processors.JSONRenderer()
]
)
# OpenTelemetry imports
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
from opentelemetry.instrumentation.redis import RedisInstrumentor
from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
from opentelemetry.sdk.resources import Resource
# Configure OpenTelemetry tracing
def setup_tracing(service_name: str = "ai-insights"):
"""Initialize OpenTelemetry tracing with OTLP exporter for Jaeger"""
resource = Resource.create({"service.name": service_name})
otlp_exporter = OTLPSpanExporter(
endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://otel-collector.monitoring.svc.cluster.local:4317"),
insecure=True
)
provider = TracerProvider(resource=resource)
processor = BatchSpanProcessor(otlp_exporter)
provider.add_span_processor(processor)
trace.set_tracer_provider(provider)
return provider
# Initialize tracing
tracer_provider = setup_tracing("ai-insights")
# Setup logging
setup_logging("ai-insights", getattr(settings, 'LOG_LEVEL', 'INFO'))
logger = structlog.get_logger()
@@ -28,6 +56,10 @@ async def lifespan(app: FastAPI):
await init_db()
logger.info("Database initialized")
# Start metrics server
metrics_collector.start_metrics_server(8080)
logger.info("Metrics server started on port 8080")
yield
# Shutdown
@@ -44,6 +76,24 @@ app = FastAPI(
lifespan=lifespan
)
# Instrument FastAPI with OpenTelemetry
FastAPIInstrumentor.instrument_app(app)
# Instrument httpx for outgoing requests
HTTPXClientInstrumentor().instrument()
# Instrument Redis
RedisInstrumentor().instrument()
# Instrument SQLAlchemy
SQLAlchemyInstrumentor().instrument()
# Initialize metrics collector
metrics_collector = MetricsCollector("ai-insights")
# Add metrics middleware to track HTTP requests
add_metrics_middleware(app, metrics_collector)
# CORS middleware
app.add_middleware(
CORSMiddleware,
@@ -81,6 +131,15 @@ async def health_check():
}
@app.get("/metrics")
async def metrics():
"""Prometheus metrics endpoint"""
return Response(
content=metrics_collector.get_metrics(),
media_type="text/plain; version=0.0.4; charset=utf-8"
)
if __name__ == "__main__":
import uvicorn

View File

@@ -29,6 +29,16 @@ pytz==2023.3
# Logging
structlog==23.2.0
# Monitoring and Observability
prometheus-client==0.23.1
opentelemetry-api==1.27.0
opentelemetry-sdk==1.27.0
opentelemetry-instrumentation-fastapi==0.48b0
opentelemetry-exporter-otlp-proto-grpc==1.27.0
opentelemetry-instrumentation-httpx==0.48b0
opentelemetry-instrumentation-redis==0.48b0
opentelemetry-instrumentation-sqlalchemy==0.48b0
# Machine Learning (for confidence scoring and impact estimation)
numpy==1.26.2
pandas==2.1.3

View File

@@ -4,25 +4,52 @@ Alert Processor Service v2.0
Main FastAPI application with RabbitMQ consumer lifecycle management.
"""
from fastapi import FastAPI
from fastapi import FastAPI, Response
from fastapi.middleware.cors import CORSMiddleware
from contextlib import asynccontextmanager
import structlog
import os
from app.core.config import settings
from app.consumer.event_consumer import EventConsumer
from app.api import alerts, sse
from shared.redis_utils import initialize_redis, close_redis
from shared.monitoring.logging import setup_logging
from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware
# Configure structured logging
structlog.configure(
processors=[
structlog.processors.TimeStamper(fmt="iso"),
structlog.processors.add_log_level,
structlog.processors.JSONRenderer()
]
)
# OpenTelemetry imports
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
from opentelemetry.instrumentation.redis import RedisInstrumentor
from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
from opentelemetry.sdk.resources import Resource
# Configure OpenTelemetry tracing
def setup_tracing(service_name: str = "alert-processor"):
"""Initialize OpenTelemetry tracing with OTLP exporter for Jaeger"""
resource = Resource.create({"service.name": service_name})
otlp_exporter = OTLPSpanExporter(
endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://otel-collector.monitoring.svc.cluster.local:4317"),
insecure=True
)
provider = TracerProvider(resource=resource)
processor = BatchSpanProcessor(otlp_exporter)
provider.add_span_processor(processor)
trace.set_tracer_provider(provider)
return provider
# Initialize tracing
tracer_provider = setup_tracing("alert-processor")
# Setup logging
setup_logging("alert-processor", getattr(settings, 'LOG_LEVEL', 'INFO'))
logger = structlog.get_logger()
# Global consumer instance
@@ -54,6 +81,10 @@ async def lifespan(app: FastAPI):
consumer = EventConsumer()
await consumer.start()
logger.info("alert_processor_started")
# Start metrics server
metrics_collector.start_metrics_server(8080)
logger.info("Metrics server started on port 8080")
except Exception as e:
logger.error("alert_processor_startup_failed", error=str(e))
raise
@@ -79,6 +110,24 @@ app = FastAPI(
debug=settings.DEBUG
)
# Instrument FastAPI with OpenTelemetry
FastAPIInstrumentor.instrument_app(app)
# Instrument httpx for outgoing requests
HTTPXClientInstrumentor().instrument()
# Instrument Redis
RedisInstrumentor().instrument()
# Instrument SQLAlchemy
SQLAlchemyInstrumentor().instrument()
# Initialize metrics collector
metrics_collector = MetricsCollector("alert-processor")
# Add metrics middleware to track HTTP requests
add_metrics_middleware(app, metrics_collector)
# CORS middleware
app.add_middleware(
CORSMiddleware,
@@ -126,6 +175,15 @@ async def root():
}
@app.get("/metrics")
async def metrics():
"""Prometheus metrics endpoint"""
return Response(
content=metrics_collector.get_metrics(),
media_type="text/plain; version=0.0.4; charset=utf-8"
)
if __name__ == "__main__":
import uvicorn

View File

@@ -32,3 +32,13 @@ python-dateutil==2.8.2
# Authentication
python-jose[cryptography]==3.3.0
# Monitoring and Observability
prometheus-client==0.23.1
opentelemetry-api==1.27.0
opentelemetry-sdk==1.27.0
opentelemetry-instrumentation-fastapi==0.48b0
opentelemetry-exporter-otlp-proto-grpc==1.27.0
opentelemetry-instrumentation-httpx==0.48b0
opentelemetry-instrumentation-redis==0.48b0
opentelemetry-instrumentation-sqlalchemy==0.48b0

View File

@@ -3,16 +3,51 @@ Demo Session Service - Main Application
Manages isolated demo sessions with ephemeral data
"""
from fastapi import FastAPI, Request
from fastapi import FastAPI, Request, Response
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
import structlog
from contextlib import asynccontextmanager
import os
from app.core import settings, DatabaseManager
from app.api import demo_sessions, demo_accounts, demo_operations, internal
from shared.redis_utils import initialize_redis, close_redis
from shared.monitoring.logging import setup_logging
from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware
# OpenTelemetry imports
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
from opentelemetry.instrumentation.redis import RedisInstrumentor
from opentelemetry.sdk.resources import Resource
# Configure OpenTelemetry tracing
def setup_tracing(service_name: str = "demo-session"):
"""Initialize OpenTelemetry tracing with OTLP exporter for Jaeger"""
resource = Resource.create({"service.name": service_name})
otlp_exporter = OTLPSpanExporter(
endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://otel-collector.monitoring.svc.cluster.local:4317"),
insecure=True
)
provider = TracerProvider(resource=resource)
processor = BatchSpanProcessor(otlp_exporter)
provider.add_span_processor(processor)
trace.set_tracer_provider(provider)
return provider
# Initialize tracing
tracer_provider = setup_tracing("demo-session")
# Setup logging
setup_logging("demo-session", getattr(settings, 'LOG_LEVEL', 'INFO'))
logger = structlog.get_logger()
# Initialize database
@@ -34,6 +69,10 @@ async def lifespan(app: FastAPI):
max_connections=50
)
# Start metrics server
metrics_collector.start_metrics_server(8080)
logger.info("Metrics server started on port 8080")
logger.info("Demo Session Service started successfully")
yield
@@ -52,6 +91,21 @@ app = FastAPI(
lifespan=lifespan
)
# Instrument FastAPI with OpenTelemetry
FastAPIInstrumentor.instrument_app(app)
# Instrument httpx for outgoing requests
HTTPXClientInstrumentor().instrument()
# Instrument Redis
RedisInstrumentor().instrument()
# Initialize metrics collector
metrics_collector = MetricsCollector("demo-session")
# Add metrics middleware to track HTTP requests
add_metrics_middleware(app, metrics_collector)
# CORS middleware
app.add_middleware(
CORSMiddleware,
@@ -110,6 +164,15 @@ async def health():
}
@app.get("/metrics")
async def metrics():
"""Prometheus metrics endpoint"""
return Response(
content=metrics_collector.get_metrics(),
media_type="text/plain; version=0.0.4; charset=utf-8"
)
if __name__ == "__main__":
import uvicorn
uvicorn.run(

View File

@@ -18,3 +18,11 @@ prometheus-client==0.23.1
aio-pika==9.4.3
email-validator==2.2.0
pytz==2024.2
# OpenTelemetry for distributed tracing
opentelemetry-api==1.27.0
opentelemetry-sdk==1.27.0
opentelemetry-instrumentation-fastapi==0.48b0
opentelemetry-exporter-otlp-proto-grpc==1.27.0
opentelemetry-instrumentation-httpx==0.48b0
opentelemetry-instrumentation-redis==0.48b0