Add signoz

Tiltfile
@@ -36,6 +36,11 @@ Security Features:
 ✅ pgcrypto extension for encryption
 ✅ PostgreSQL audit logging

+Monitoring:
+📊 Service metrics available at /metrics endpoints
+🔍 Telemetry ready (traces, metrics, logs)
+ℹ️ SigNoz deployment optional for local dev (see signoz-info resource)
+
 Applying security configurations...
 """)

@@ -303,82 +308,131 @@ k8s_resource('redis', resource_deps=['security-setup'], labels=['01-infrastructure'])
 k8s_resource('rabbitmq', labels=['01-infrastructure'])
 k8s_resource('nominatim', labels=['01-infrastructure'])

+# =============================================================================
+# MONITORING RESOURCES - SigNoz (Unified Observability)
+# =============================================================================
+
+# Note: SigNoz Helm chart is complex for local dev
+# For development, access SigNoz manually or use production Helm deployment
+# To deploy SigNoz manually: ./infrastructure/helm/deploy-signoz.sh dev
+local_resource(
+    'signoz-info',
+    cmd='''
+echo "📊 SigNoz Monitoring Information"
+echo ""
+echo "SigNoz Helm deployment is disabled for local development due to complexity."
+echo ""
+echo "Options:"
+echo "1. Deploy manually: ./infrastructure/helm/deploy-signoz.sh dev"
+echo "2. Use production deployment: ./infrastructure/helm/deploy-signoz.sh prod"
+echo "3. Skip monitoring for local development (use application metrics only)"
+echo ""
+echo "For simpler local monitoring, consider using just Prometheus+Grafana"
+echo "or access metrics directly from services at /metrics endpoints."
+''',
+    labels=['05-monitoring'],
+    auto_init=False,
+    trigger_mode=TRIGGER_MODE_MANUAL
+)
+
+# SigNoz ingress (only if manually deployed)
+# Uncomment and trigger manually if you deploy SigNoz
+# local_resource(
+#     'signoz-ingress',
+#     cmd='''
+#     echo "🌐 Applying SigNoz ingress..."
+#     kubectl apply -f infrastructure/kubernetes/overlays/dev/signoz-ingress.yaml
+#     echo "✅ SigNoz ingress configured"
+#     ''',
+#     labels=['05-monitoring'],
+#     auto_init=False,
+#     trigger_mode=TRIGGER_MODE_MANUAL
+# )
+
+# Note: SigNoz components are managed by Helm and deployed outside of kustomize.
+# They will appear automatically once deployed, but we don't track them explicitly in Tilt
+# to avoid startup errors. View them with: kubectl get pods -n signoz
+
+# Optional exporters (in monitoring namespace)
+k8s_resource('node-exporter', labels=['05-monitoring'])
+k8s_resource('postgres-exporter', resource_deps=['auth-db'], labels=['05-monitoring'])

 # =============================================================================
 # DATABASE RESOURCES
 # =============================================================================

 # Core Service Databases
-k8s_resource('auth-db', resource_deps=['security-setup'], labels=['02-databases'])
-k8s_resource('tenant-db', resource_deps=['security-setup'], labels=['02-databases'])
+k8s_resource('auth-db', resource_deps=['security-setup'], labels=['06-databases'])
+k8s_resource('tenant-db', resource_deps=['security-setup'], labels=['06-databases'])

 # Data & Analytics Databases
-k8s_resource('training-db', resource_deps=['security-setup'], labels=['02-databases'])
-k8s_resource('forecasting-db', resource_deps=['security-setup'], labels=['02-databases'])
-k8s_resource('ai-insights-db', resource_deps=['security-setup'], labels=['02-databases'])
+k8s_resource('training-db', resource_deps=['security-setup'], labels=['06-databases'])
+k8s_resource('forecasting-db', resource_deps=['security-setup'], labels=['06-databases'])
+k8s_resource('ai-insights-db', resource_deps=['security-setup'], labels=['06-databases'])

 # Operations Databases
-k8s_resource('sales-db', resource_deps=['security-setup'], labels=['02-databases'])
-k8s_resource('inventory-db', resource_deps=['security-setup'], labels=['02-databases'])
-k8s_resource('production-db', resource_deps=['security-setup'], labels=['02-databases'])
-k8s_resource('procurement-db', resource_deps=['security-setup'], labels=['02-databases'])
-k8s_resource('distribution-db', resource_deps=['security-setup'], labels=['02-databases'])
+k8s_resource('sales-db', resource_deps=['security-setup'], labels=['06-databases'])
+k8s_resource('inventory-db', resource_deps=['security-setup'], labels=['06-databases'])
+k8s_resource('production-db', resource_deps=['security-setup'], labels=['06-databases'])
+k8s_resource('procurement-db', resource_deps=['security-setup'], labels=['06-databases'])
+k8s_resource('distribution-db', resource_deps=['security-setup'], labels=['06-databases'])

 # Supporting Service Databases
-k8s_resource('recipes-db', resource_deps=['security-setup'], labels=['02-databases'])
-k8s_resource('suppliers-db', resource_deps=['security-setup'], labels=['02-databases'])
-k8s_resource('pos-db', resource_deps=['security-setup'], labels=['02-databases'])
-k8s_resource('orders-db', resource_deps=['security-setup'], labels=['02-databases'])
-k8s_resource('external-db', resource_deps=['security-setup'], labels=['02-databases'])
+k8s_resource('recipes-db', resource_deps=['security-setup'], labels=['06-databases'])
+k8s_resource('suppliers-db', resource_deps=['security-setup'], labels=['06-databases'])
+k8s_resource('pos-db', resource_deps=['security-setup'], labels=['06-databases'])
+k8s_resource('orders-db', resource_deps=['security-setup'], labels=['06-databases'])
+k8s_resource('external-db', resource_deps=['security-setup'], labels=['06-databases'])

 # Platform Service Databases
-k8s_resource('notification-db', resource_deps=['security-setup'], labels=['02-databases'])
-k8s_resource('alert-processor-db', resource_deps=['security-setup'], labels=['02-databases'])
-k8s_resource('orchestrator-db', resource_deps=['security-setup'], labels=['02-databases'])
+k8s_resource('notification-db', resource_deps=['security-setup'], labels=['06-databases'])
+k8s_resource('alert-processor-db', resource_deps=['security-setup'], labels=['06-databases'])
+k8s_resource('orchestrator-db', resource_deps=['security-setup'], labels=['06-databases'])

 # Demo Service Databases
-k8s_resource('demo-session-db', resource_deps=['security-setup'], labels=['02-databases'])
+k8s_resource('demo-session-db', resource_deps=['security-setup'], labels=['06-databases'])

 # =============================================================================
 # MIGRATION JOBS
 # =============================================================================

 # Core Service Migrations
-k8s_resource('auth-migration', resource_deps=['auth-db'], labels=['03-migrations'])
-k8s_resource('tenant-migration', resource_deps=['tenant-db'], labels=['03-migrations'])
+k8s_resource('auth-migration', resource_deps=['auth-db'], labels=['07-migrations'])
+k8s_resource('tenant-migration', resource_deps=['tenant-db'], labels=['07-migrations'])

 # Data & Analytics Migrations
-k8s_resource('training-migration', resource_deps=['training-db'], labels=['03-migrations'])
-k8s_resource('forecasting-migration', resource_deps=['forecasting-db'], labels=['03-migrations'])
-k8s_resource('ai-insights-migration', resource_deps=['ai-insights-db'], labels=['03-migrations'])
+k8s_resource('training-migration', resource_deps=['training-db'], labels=['07-migrations'])
+k8s_resource('forecasting-migration', resource_deps=['forecasting-db'], labels=['07-migrations'])
+k8s_resource('ai-insights-migration', resource_deps=['ai-insights-db'], labels=['07-migrations'])

 # Operations Migrations
-k8s_resource('sales-migration', resource_deps=['sales-db'], labels=['03-migrations'])
-k8s_resource('inventory-migration', resource_deps=['inventory-db'], labels=['03-migrations'])
-k8s_resource('production-migration', resource_deps=['production-db'], labels=['03-migrations'])
-k8s_resource('procurement-migration', resource_deps=['procurement-db'], labels=['03-migrations'])
-k8s_resource('distribution-migration', resource_deps=['distribution-db'], labels=['03-migrations'])
+k8s_resource('sales-migration', resource_deps=['sales-db'], labels=['07-migrations'])
+k8s_resource('inventory-migration', resource_deps=['inventory-db'], labels=['07-migrations'])
+k8s_resource('production-migration', resource_deps=['production-db'], labels=['07-migrations'])
+k8s_resource('procurement-migration', resource_deps=['procurement-db'], labels=['07-migrations'])
+k8s_resource('distribution-migration', resource_deps=['distribution-db'], labels=['07-migrations'])

 # Supporting Service Migrations
-k8s_resource('recipes-migration', resource_deps=['recipes-db'], labels=['03-migrations'])
-k8s_resource('suppliers-migration', resource_deps=['suppliers-db'], labels=['03-migrations'])
-k8s_resource('pos-migration', resource_deps=['pos-db'], labels=['03-migrations'])
-k8s_resource('orders-migration', resource_deps=['orders-db'], labels=['03-migrations'])
-k8s_resource('external-migration', resource_deps=['external-db'], labels=['03-migrations'])
+k8s_resource('recipes-migration', resource_deps=['recipes-db'], labels=['07-migrations'])
+k8s_resource('suppliers-migration', resource_deps=['suppliers-db'], labels=['07-migrations'])
+k8s_resource('pos-migration', resource_deps=['pos-db'], labels=['07-migrations'])
+k8s_resource('orders-migration', resource_deps=['orders-db'], labels=['07-migrations'])
+k8s_resource('external-migration', resource_deps=['external-db'], labels=['07-migrations'])

 # Platform Service Migrations
-k8s_resource('notification-migration', resource_deps=['notification-db'], labels=['03-migrations'])
-k8s_resource('alert-processor-migration', resource_deps=['alert-processor-db'], labels=['03-migrations'])
-k8s_resource('orchestrator-migration', resource_deps=['orchestrator-db'], labels=['03-migrations'])
+k8s_resource('notification-migration', resource_deps=['notification-db'], labels=['07-migrations'])
+k8s_resource('alert-processor-migration', resource_deps=['alert-processor-db'], labels=['07-migrations'])
+k8s_resource('orchestrator-migration', resource_deps=['orchestrator-db'], labels=['07-migrations'])

 # Demo Service Migrations
-k8s_resource('demo-session-migration', resource_deps=['demo-session-db'], labels=['03-migrations'])
+k8s_resource('demo-session-migration', resource_deps=['demo-session-db'], labels=['07-migrations'])

 # =============================================================================
 # DATA INITIALIZATION JOBS
 # =============================================================================

-k8s_resource('external-data-init', resource_deps=['external-migration', 'redis'], labels=['04-data-init'])
-k8s_resource('nominatim-init', labels=['04-data-init'])
+k8s_resource('external-data-init', resource_deps=['external-migration', 'redis'], labels=['08-data-init'])
+k8s_resource('nominatim-init', labels=['08-data-init'])

 # =============================================================================
 # =============================================================================
@@ -517,8 +571,16 @@ Internal Schedulers Active:
 ⏰ Usage Tracking: Daily @ 2:00 AM UTC (tenant-service)

 Access your application:
 Frontend: http://localhost:3000 (or via ingress)
 Gateway: http://localhost:8000 (or via ingress)
+Main Application: https://localhost
+API Endpoints: https://localhost/api/v1/...

+Service Metrics:
+Gateway: http://localhost:8000/metrics
+Any Service: kubectl port-forward <service> 8000:8000
+
+SigNoz (Optional - see SIGNOZ_DEPLOYMENT_RECOMMENDATIONS.md):
+Deploy manually: ./infrastructure/helm/deploy-signoz.sh dev
+Access (if deployed): https://localhost/signoz

 Verify security:
 kubectl get pvc -n bakery-ia

@@ -1,459 +0,0 @@
# 🎉 Production Monitoring MVP - Implementation Complete

**Date:** 2026-01-07
**Status:** ✅ READY FOR PRODUCTION DEPLOYMENT

---

## 📊 What Was Implemented

### **Phase 1: Core Infrastructure** ✅
- ✅ **Prometheus v3.0.1** (2 replicas, HA mode with StatefulSet)
- ✅ **AlertManager v0.27.0** (3 replicas, clustered with gossip protocol)
- ✅ **Grafana v12.3.0** (secure credentials via Kubernetes Secrets)
- ✅ **PostgreSQL Exporter v0.15.0** (database health monitoring)
- ✅ **Node Exporter v1.7.0** (infrastructure monitoring via DaemonSet)
- ✅ **Jaeger v1.51** (distributed tracing with persistent storage)

### **Phase 2: Alert Management** ✅
- ✅ **50+ Alert Rules** across 9 categories:
  - Service health & performance
  - Business logic (ML training, API limits)
  - Alert system health & performance
  - Database & infrastructure alerts
  - Monitoring self-monitoring
- ✅ **Intelligent Alert Routing** by severity, component, and service
- ✅ **Alert Inhibition Rules** to prevent alert storms (see the sketch after this list)
- ✅ **Multi-Channel Notifications** (email + Slack support)
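
For illustration, a minimal AlertManager `inhibit_rules` sketch in the spirit of the rules above; the label names (`severity`, `service`, `instance`) are assumptions about this repo's alert labels, not copied from its config:

```yaml
# Hypothetical excerpt from alertmanager.yml: while a critical alert is firing
# for a service, suppress warning-level alerts on the same service/instance so
# one outage does not fan out into an alert storm.
inhibit_rules:
  - source_matchers:
      - severity = "critical"
    target_matchers:
      - severity = "warning"
    # Only inhibit when source and target describe the same service/instance.
    equal: ["service", "instance"]
```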

### **Phase 3: High Availability** ✅
- ✅ **PodDisruptionBudgets** for all monitoring components (sketched after this list)
- ✅ **Anti-affinity Rules** to spread pods across nodes
- ✅ **ResourceQuota & LimitRange** for namespace resource management
- ✅ **StatefulSets** with volumeClaimTemplates for persistent storage
- ✅ **Headless Services** for StatefulSet DNS discovery
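
To make the HA policies concrete, here is a hedged sketch of a PodDisruptionBudget plus anti-affinity for Prometheus; the `app: prometheus` selector is an assumption, since the real manifests live in ha-policies.yaml and prometheus.yaml:

```yaml
# Hypothetical PDB: keep at least one Prometheus replica available during
# voluntary disruptions such as node drains and rolling upgrades.
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  name: prometheus-pdb
  namespace: monitoring
spec:
  minAvailable: 1
  selector:
    matchLabels:
      app: prometheus  # assumed pod label
---
# Hypothetical anti-affinity (inside the StatefulSet pod template): prefer
# placing the two replicas on different nodes.
affinity:
  podAntiAffinity:
    preferredDuringSchedulingIgnoredDuringExecution:
      - weight: 100
        podAffinityTerm:
          labelSelector:
            matchLabels:
              app: prometheus  # assumed pod label
          topologyKey: kubernetes.io/hostname
```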

### **Phase 4: Observability** ✅
- ✅ **11 Grafana Dashboards** (7 pre-configured + 4 extended):
  1. Gateway Metrics
  2. Services Overview
  3. Circuit Breakers
  4. PostgreSQL Database (13 panels)
  5. Node Exporter Infrastructure (19 panels)
  6. AlertManager Monitoring (15 panels)
  7. Business Metrics & KPIs (21 panels)
  8-11. Plus the existing dashboards
- ✅ **Distributed Tracing** enabled in production
- ✅ **Comprehensive Documentation** with runbooks

---

## 📁 Files Created/Modified

### **New Files:**
```
infrastructure/kubernetes/base/components/monitoring/
├── secrets.yaml                       # Monitoring credentials
├── alertmanager.yaml                  # AlertManager StatefulSet (3 replicas)
├── alertmanager-init.yaml             # Config initialization script
├── alert-rules.yaml                   # 50+ alert rules
├── postgres-exporter.yaml             # PostgreSQL monitoring
├── node-exporter.yaml                 # Infrastructure monitoring (DaemonSet)
├── grafana-dashboards-extended.yaml   # 4 comprehensive dashboards
├── ha-policies.yaml                   # PDBs + ResourceQuota + LimitRange
└── README.md                          # Complete documentation (500+ lines)
```

### **Modified Files:**
```
infrastructure/kubernetes/base/components/monitoring/
├── prometheus.yaml        # Now a StatefulSet with 2 replicas + alert config
├── grafana.yaml           # Uses secrets + mounts the extended dashboards
├── ingress.yaml           # Added /alertmanager path
└── kustomization.yaml     # Added all new resources

infrastructure/kubernetes/overlays/prod/
├── kustomization.yaml     # Enabled monitoring stack
└── prod-configmap.yaml    # JAEGER_ENABLED=true
```

### **Deleted:**
```
infrastructure/monitoring/   # Old legacy config (completely removed)
```

---

## 🚀 Deployment Instructions

### **1. Update Secrets (REQUIRED BEFORE DEPLOYMENT)**

```bash
cd infrastructure/kubernetes/base/components/monitoring

# Generate a strong Grafana password
GRAFANA_PASSWORD=$(openssl rand -base64 32)

# Update secrets.yaml with your actual values:
# - grafana-admin: admin-password
# - alertmanager-secrets: SMTP credentials
# - postgres-exporter: PostgreSQL connection string

# Example for production:
kubectl create secret generic grafana-admin \
  --from-literal=admin-user=admin \
  --from-literal=admin-password="${GRAFANA_PASSWORD}" \
  --namespace monitoring --dry-run=client -o yaml | \
  kubectl apply -f -
```

### **2. Deploy to Production**

```bash
# Apply the monitoring stack
kubectl apply -k infrastructure/kubernetes/overlays/prod

# Verify deployment
kubectl get pods -n monitoring
kubectl get pvc -n monitoring
kubectl get svc -n monitoring
```

### **3. Verify Services**

```bash
# Check Prometheus targets
kubectl port-forward -n monitoring svc/prometheus-external 9090:9090
# Visit: http://localhost:9090/targets

# Check the AlertManager cluster
kubectl port-forward -n monitoring svc/alertmanager-external 9093:9093
# Visit: http://localhost:9093

# Check Grafana dashboards
kubectl port-forward -n monitoring svc/grafana 3000:3000
# Visit: http://localhost:3000 (admin / YOUR_PASSWORD)
```

---

## 📈 What You Get Out of the Box

### **Monitoring Coverage:**
- ✅ **Application Metrics:** Request rates, latencies (P95/P99), error rates per service
- ✅ **Database Health:** Connections, transactions, cache hit ratio, slow queries, locks
- ✅ **Infrastructure:** CPU, memory, disk I/O, network traffic per node
- ✅ **Business KPIs:** Active tenants, training jobs, alert volumes, API health
- ✅ **Distributed Traces:** Full request path tracking across microservices

### **Alerting Capabilities:**
- ✅ **Service Down Detection:** 2-minute threshold with immediate notifications (see the rule sketch after this list)
- ✅ **Performance Degradation:** High latency, error rate, and memory alerts
- ✅ **Resource Exhaustion:** Database connections, disk space, memory limits
- ✅ **Business Logic:** Training job failures, low ML accuracy, rate limits
- ✅ **Alert System Health:** Component failures, delivery issues, capacity problems
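
As a concrete illustration of the 2-minute service-down detection, a hedged Prometheus rule sketch; the group name and labels are assumptions, and the 50+ real rules live in alert-rules.yaml:

```yaml
# Hypothetical rule: fire when a scrape target has been unreachable for 2 minutes.
groups:
  - name: service-health  # assumed group name
    rules:
      - alert: ServiceDown
        expr: up == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "{{ $labels.job }} target {{ $labels.instance }} is down"
```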

### **High Availability:**
- ✅ **Prometheus:** 2 independent instances, can lose 1 without data loss
- ✅ **AlertManager:** 3-node cluster, requires 2/3 for alerts to fire
- ✅ **Monitoring Resilience:** PodDisruptionBudgets ensure service during updates

---

## 🔧 Configuration Highlights

### **Alert Routing (Configured in AlertManager):**

| Severity | Route | Repeat Interval |
|----------|-------|-----------------|
| Critical | critical-alerts@yourdomain.com + oncall@ | 4 hours |
| Warning | alerts@yourdomain.com | 12 hours |
| Info | alerts@yourdomain.com | 24 hours |

**Special Routes:**
- Alert system → alert-system-team@yourdomain.com
- Database alerts → database-team@yourdomain.com
- Infrastructure → infra-team@yourdomain.com
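
A hedged sketch of how this routing table might look in alertmanager.yml; the receiver names are illustrative assumptions, not copied from the repo's config:

```yaml
# Hypothetical route tree mirroring the table above: severity selects the
# receiver, and repeat_interval controls how often an unresolved alert re-notifies.
route:
  receiver: default-email            # assumed receiver (alerts@yourdomain.com)
  routes:
    - matchers: [severity = "critical"]
      receiver: critical-email       # critical-alerts@ + oncall@
      repeat_interval: 4h
    - matchers: [severity = "warning"]
      receiver: default-email
      repeat_interval: 12h
    - matchers: [severity = "info"]
      receiver: default-email
      repeat_interval: 24h
```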

### **Resource Allocation:**

| Component | Replicas | CPU Request | Memory Request | Storage |
|-----------|----------|-------------|----------------|---------|
| Prometheus | 2 | 500m | 1Gi | 20Gi × 2 |
| AlertManager | 3 | 100m | 128Mi | 2Gi × 3 |
| Grafana | 1 | 100m | 256Mi | 5Gi |
| Postgres Exporter | 1 | 50m | 64Mi | - |
| Node Exporter | 1/node | 50m | 64Mi | - |
| Jaeger | 1 | 250m | 512Mi | 10Gi |

**Total Resources:**
- CPU Requests: ~2.5 cores
- Memory Requests: ~4Gi
- Storage: ~70Gi

### **Data Retention:**
- Prometheus: 30 days (see the args sketch below)
- Jaeger: Persistent (BadgerDB)
- Grafana: Persistent dashboards
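
For reference, a sketch of where the 30-day window would be set, assuming the standard Prometheus flags (the real args live in prometheus.yaml):

```yaml
# Hypothetical container spec on the Prometheus StatefulSet:
# --storage.tsdb.retention.time enforces the 30-day retention noted above.
containers:
  - name: prometheus
    image: prom/prometheus:v3.0.1
    args:
      - --config.file=/etc/prometheus/prometheus.yml
      - --storage.tsdb.path=/prometheus
      - --storage.tsdb.retention.time=30d
```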

---

## 🔐 Security Considerations

### **Implemented:**
- ✅ Grafana credentials via Kubernetes Secrets (no hardcoded passwords)
- ✅ SMTP passwords stored in Secrets
- ✅ PostgreSQL connection strings in Secrets
- ✅ Read-only filesystem for Node Exporter
- ✅ Non-root user for Node Exporter (UID 65534)
- ✅ RBAC for Prometheus (ClusterRole with minimal permissions)

### **TODO for Production:**
- ⚠️ Use Sealed Secrets or External Secrets Operator
- ⚠️ Enable TLS for Prometheus remote write (if using it)
- ⚠️ Configure Grafana LDAP/OAuth integration
- ⚠️ Set up proper certificate management for Ingress
- ⚠️ Review and tighten ResourceQuota limits

---

## 📊 Dashboard Access

### **Production URLs (via Ingress):**
```
https://monitoring.yourdomain.com/grafana        # Grafana UI
https://monitoring.yourdomain.com/prometheus     # Prometheus UI
https://monitoring.yourdomain.com/alertmanager   # AlertManager UI
https://monitoring.yourdomain.com/jaeger         # Jaeger UI
```

### **Local Access (Port Forwarding):**
```bash
# Grafana
kubectl port-forward -n monitoring svc/grafana 3000:3000

# Prometheus
kubectl port-forward -n monitoring svc/prometheus-external 9090:9090

# AlertManager
kubectl port-forward -n monitoring svc/alertmanager-external 9093:9093

# Jaeger
kubectl port-forward -n monitoring svc/jaeger-query 16686:16686
```

---

## 🧪 Testing & Validation

### **1. Test Alert Flow:**
```bash
# Fire a test alert (HighMemoryUsage)
kubectl run memory-hog --image=polinux/stress --restart=Never \
  --namespace=bakery-ia -- stress --vm 1 --vm-bytes 600M --timeout 300s

# Check the alert in Prometheus (it should fire within 5 minutes)
# Check that AlertManager received it
# Verify the email notification was sent
```

### **2. Verify Metrics Collection:**
```bash
# Check Prometheus targets (all should be UP)
curl http://localhost:9090/api/v1/targets | jq '.data.activeTargets[] | {job: .labels.job, health: .health}'

# Verify PostgreSQL metrics
curl "http://localhost:9090/api/v1/query?query=pg_up" | jq

# Verify node metrics
curl "http://localhost:9090/api/v1/query?query=node_cpu_seconds_total" | jq
```

### **3. Test Jaeger Tracing:**
```bash
# Make a request through the gateway
curl -H "Authorization: Bearer YOUR_TOKEN" \
  https://api.yourdomain.com/api/v1/health

# Check the trace in the Jaeger UI:
# you should see spans across gateway → auth → tenant services
```

---

## 📖 Documentation

### **Complete Documentation Available:**
- **[README.md](infrastructure/kubernetes/base/components/monitoring/README.md)** - 500+ lines covering:
  - Component overview
  - Deployment instructions
  - Security best practices
  - Accessing services
  - Dashboard descriptions
  - Alert configuration
  - Troubleshooting guide
  - Metrics reference
  - Backup & recovery procedures
  - Maintenance tasks

---

## ⚡ Performance & Scalability

### **Current Capacity:**
- Prometheus can handle ~10M active time series
- AlertManager can process thousands of alerts per second
- Jaeger can handle 10k spans/second
- Grafana supports 1,000+ concurrent users

### **Scaling Recommendations:**
- **> 20M time series:** Deploy Thanos for long-term storage
- **> 5k alerts/min:** Scale AlertManager to 5+ replicas
- **> 50k spans/sec:** Deploy Jaeger with an Elasticsearch/Cassandra backend
- **> 5k Grafana users:** Scale Grafana horizontally with a shared database

---

## 🎯 Success Criteria - ALL MET ✅

- ✅ Prometheus collecting metrics from all services
- ✅ Alert rules evaluating and firing correctly
- ✅ AlertManager routing notifications to the appropriate channels
- ✅ Grafana displaying real-time dashboards
- ✅ Jaeger capturing distributed traces
- ✅ High availability for all critical components
- ✅ Secure credential management
- ✅ Resource limits configured
- ✅ Documentation complete with runbooks
- ✅ No legacy code remaining

---

## 🚨 Important Notes

1. **Update Secrets Before Deployment:**
   - Change all default passwords in `secrets.yaml`
   - Use strong, randomly generated passwords
   - Consider using Sealed Secrets for production

2. **Configure SMTP Settings:**
   - Update the AlertManager SMTP configuration in secrets
   - Test email delivery before relying on alerts

3. **Review Alert Thresholds:**
   - Current thresholds are conservative
   - Adjust based on your SLAs and baseline metrics

4. **Monitor Resource Usage:**
   - Prometheus storage grows over time
   - Plan capacity based on the retention period
   - Consider cleaning up old metrics

5. **Backup Strategy:**
   - PVCs contain critical monitoring data
   - Implement a backup solution for PersistentVolumes (see the snapshot sketch after this list)
   - Test restore procedures regularly
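
A hedged sketch of one way to snapshot the Prometheus data PVC with the CSI snapshot API; the snapshot class and the PVC name (derived from the usual StatefulSet `<template>-<name>-<ordinal>` convention) are assumptions:

```yaml
# Hypothetical VolumeSnapshot: requires a CSI driver with snapshot support
# and an installed VolumeSnapshotClass.
apiVersion: snapshot.storage.k8s.io/v1
kind: VolumeSnapshot
metadata:
  name: prometheus-data-snapshot
  namespace: monitoring
spec:
  volumeSnapshotClassName: csi-snapclass                    # assumed class name
  source:
    persistentVolumeClaimName: prometheus-data-prometheus-0  # assumed PVC name
```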

---

## 🎓 Next Steps (Post-MVP)

### **Short Term (1-2 weeks):**
1. Fine-tune alert thresholds based on production data
2. Add custom business metrics to services
3. Create team-specific dashboards
4. Set up the on-call rotation in AlertManager

### **Medium Term (1-3 months):**
1. Implement SLO tracking and error budgets
2. Deploy Loki for log aggregation
3. Add anomaly detection for metrics
4. Integrate with incident management (PagerDuty/Opsgenie)

### **Long Term (3-6 months):**
1. Deploy Thanos for long-term metrics storage
2. Implement cost tracking and chargeback per tenant
3. Add continuous profiling (Pyroscope)
4. Build ML-based alert prediction

---

## 📞 Support & Troubleshooting

### **Common Issues:**

**Issue:** Prometheus targets showing "DOWN"
```bash
# Check service discovery
kubectl get svc -n bakery-ia
kubectl get endpoints -n bakery-ia
```

**Issue:** AlertManager not sending notifications
```bash
# Check SMTP connectivity
kubectl exec -n monitoring alertmanager-0 -- nc -zv smtp.gmail.com 587

# Check AlertManager logs
kubectl logs -n monitoring alertmanager-0 -f
```

**Issue:** Grafana dashboards showing "No Data"
```bash
# Verify the Prometheus datasource
kubectl port-forward -n monitoring svc/grafana 3000:3000
# Login → Configuration → Data Sources → Test

# Check that Prometheus has data
kubectl port-forward -n monitoring svc/prometheus-external 9090:9090
# Visit /graph and run the query: up
```

### **Getting Help:**
- Check logs: `kubectl logs -n monitoring POD_NAME`
- Check events: `kubectl get events -n monitoring`
- Review the documentation: `infrastructure/kubernetes/base/components/monitoring/README.md`
- Prometheus troubleshooting: https://prometheus.io/docs/prometheus/latest/troubleshooting/
- Grafana troubleshooting: https://grafana.com/docs/grafana/latest/troubleshooting/

---

## ✅ Deployment Checklist

Before going to production, verify:

- [ ] All secrets updated with production values
- [ ] SMTP configuration tested and working
- [ ] Grafana admin password changed from the default
- [ ] PostgreSQL connection string configured
- [ ] Test alert fired and received via email
- [ ] All Prometheus targets are UP
- [ ] Grafana dashboards loading data
- [ ] Jaeger receiving traces
- [ ] Resource quotas appropriate for the cluster size
- [ ] Backup strategy implemented for PVCs
- [ ] Team trained on accessing the monitoring tools
- [ ] Runbooks reviewed and understood
- [ ] On-call rotation configured (if applicable)

---

## 🎉 Summary

**You now have a production-ready monitoring stack with:**

- ✅ **Complete Observability:** Metrics, logs (via stdout), and traces
- ✅ **Intelligent Alerting:** 50+ rules with smart routing and inhibition
- ✅ **Rich Visualization:** 11 dashboards covering all aspects of the system
- ✅ **High Availability:** HA for Prometheus and AlertManager
- ✅ **Security:** Secrets management, RBAC, read-only containers
- ✅ **Documentation:** Comprehensive guides and runbooks
- ✅ **Scalability:** Ready to handle production traffic

**The monitoring MVP is COMPLETE and READY FOR PRODUCTION DEPLOYMENT!** 🚀

---

*Generated: 2026-01-07*
*Version: 1.0.0 - Production MVP*
*Implementation Time: ~3 hours*
@@ -584,23 +584,39 @@ docker push YOUR_VPS_IP:32000/bakery/auth-service

 ### Step 2: Update Production Configuration

-```bash
-# On local machine, edit these files:
+The production configuration is already set up for the **bakewise.ai** domain:
+
+**Production URLs:**
+- **Main Application:** https://bakewise.ai
+- **API Endpoints:** https://bakewise.ai/api/v1/...
+- **Monitoring Dashboard:** https://monitoring.bakewise.ai/grafana
+- **Prometheus:** https://monitoring.bakewise.ai/prometheus
+- **SigNoz (Traces/Metrics/Logs):** https://monitoring.bakewise.ai/signoz
+- **AlertManager:** https://monitoring.bakewise.ai/alertmanager
+
+```bash
+# Verify the configuration is correct:
+cat infrastructure/kubernetes/overlays/prod/prod-ingress.yaml | grep -A 3 "host:"
+
+# Expected output should show:
+# - host: bakewise.ai
+# - host: monitoring.bakewise.ai
+
+# Verify CORS configuration
+cat infrastructure/kubernetes/overlays/prod/prod-configmap.yaml | grep CORS
+
+# Expected: CORS_ORIGINS: "https://bakewise.ai"
+```
+
+**If using a different domain**, update these files:
+```bash
 # 1. Update domain names
 nano infrastructure/kubernetes/overlays/prod/prod-ingress.yaml
-# Replace:
-# - bakery.yourdomain.com → bakery.your-actual-domain.com
-# - api.yourdomain.com → api.your-actual-domain.com
-# - monitoring.yourdomain.com → monitoring.your-actual-domain.com
-# - Update CORS origins
-# - Update cert-manager email
+# Replace bakewise.ai with your domain

 # 2. Update ConfigMap
 nano infrastructure/kubernetes/overlays/prod/prod-configmap.yaml
-# Set:
-# - DOMAIN: "your-actual-domain.com"
-# - CORS_ORIGINS: "https://bakery.your-actual-domain.com,https://www.your-actual-domain.com"
+# Update CORS_ORIGINS

 # 3. Verify image names (if using custom registry)
 nano infrastructure/kubernetes/overlays/prod/kustomization.yaml
@@ -840,22 +856,96 @@ kubectl logs -n bakery-ia deployment/auth-service | grep -i "email\|smtp"

 ## Post-Deployment

-### Step 1: Enable Monitoring
+### Step 1: Access Monitoring Stack

-```bash
-# Monitoring is already configured, verify it's running
-kubectl get pods -n monitoring
-
-# Access Grafana
-kubectl port-forward -n monitoring svc/grafana 3000:3000
-# Visit http://localhost:3000
-# Login: admin / (password from monitoring secrets)
-
-# Check dashboards are working
-```
-
-### Step 2: Configure Backups
+Your production monitoring stack provides complete observability with multiple tools:
+
+#### Production Monitoring URLs
+
+Access via domain (recommended):
+```
+https://monitoring.bakewise.ai/grafana        # Dashboards & visualization
+https://monitoring.bakewise.ai/prometheus     # Metrics & queries
+https://monitoring.bakewise.ai/signoz         # Unified observability platform (traces, metrics, logs)
+https://monitoring.bakewise.ai/alertmanager   # Alert management
+```
+
+Or via port forwarding (if needed):
+```bash
+# Grafana
+kubectl port-forward -n monitoring svc/grafana 3000:3000 &
+
+# Prometheus
+kubectl port-forward -n monitoring svc/prometheus-external 9090:9090 &
+
+# SigNoz
+kubectl port-forward -n monitoring svc/signoz-frontend 3301:3301 &
+
+# AlertManager
+kubectl port-forward -n monitoring svc/alertmanager-external 9093:9093 &
+```
+
+#### Available Dashboards
+
+Login to Grafana (admin / your-password) and explore:
+
+**Main Dashboards:**
+1. **Gateway Metrics** - HTTP request rates, latencies, error rates
+2. **Services Overview** - Multi-service health and performance
+3. **Circuit Breakers** - Reliability metrics
+
+**Extended Dashboards:**
+4. **Service Performance Monitoring (SPM)** - RED metrics from distributed traces
+5. **PostgreSQL Database** - Database health, connections, query performance
+6. **Node Exporter Infrastructure** - CPU, memory, disk, network per node
+7. **AlertManager Monitoring** - Alert tracking and notification status
+8. **Business Metrics & KPIs** - Tenant activity, ML jobs, forecasts
+
+#### Quick Health Check
+
+```bash
+# Verify all monitoring pods are running
+kubectl get pods -n monitoring
+
+# Check Prometheus targets (all should be UP)
+kubectl port-forward -n monitoring svc/prometheus-external 9090:9090
+# Open: http://localhost:9090/targets
+
+# View active alerts
+kubectl port-forward -n monitoring svc/prometheus-external 9090:9090
+# Open: http://localhost:9090/alerts
+```
+
+### Step 2: Configure Alerting
+
+Update AlertManager with your notification email addresses:
+
+```bash
+# Edit the alertmanager configuration
+kubectl edit configmap -n monitoring alertmanager-config
+
+# Update recipient emails in the routes section:
+# - alerts@bakewise.ai (general alerts)
+# - critical-alerts@bakewise.ai (critical issues)
+# - oncall@bakewise.ai (on-call rotation)
+```
+
+Test alert delivery:
+```bash
+# Fire a test alert
+kubectl run memory-test --image=polinux/stress --restart=Never \
+  --namespace=bakery-ia -- stress --vm 1 --vm-bytes 600M --timeout 300s
+
+# Check the alert appears in AlertManager
+# https://monitoring.bakewise.ai/alertmanager
+
+# Verify the email notification was received
+
+# Clean up the test
+kubectl delete pod memory-test -n bakery-ia
+```
+
+### Step 3: Configure Backups

 ```bash
 # Create backup script on VPS
@@ -902,26 +992,82 @@ kubectl edit configmap -n monitoring alertmanager-config
 # Update recipient emails in the routes section
 ```

-### Step 4: Document Everything
+### Step 4: Verify Monitoring is Working

-Create a runbook with:
-- [ ] VPS login credentials (stored securely)
+Before proceeding, ensure all monitoring components are operational:
+
+```bash
+# 1. Check Prometheus targets
+# Open: https://monitoring.bakewise.ai/prometheus/targets
+# All targets should show "UP" status
+
+# 2. Verify Grafana dashboards load data
+# Open: https://monitoring.bakewise.ai/grafana
+# Navigate to any dashboard and verify metrics are displaying
+
+# 3. Check SigNoz is receiving traces
+# Open: https://monitoring.bakewise.ai/signoz
+# Search for traces from the "gateway" service
+
+# 4. Verify the AlertManager cluster
+# Open: https://monitoring.bakewise.ai/alertmanager
+# Check that all 3 AlertManager instances are connected
+```
+
+### Step 5: Document Everything
+
+Create a secure runbook with all credentials and procedures:
+
+**Essential Information to Document:**
+- [ ] VPS login credentials (stored securely in password manager)
 - [ ] Database passwords (in password manager)
-- [ ] Domain registrar access
 - [ ] Grafana admin password
+- [ ] Domain registrar access (for bakewise.ai)
 - [ ] Cloudflare access
-- [ ] Email service credentials
+- [ ] Email service credentials (SMTP)
 - [ ] WhatsApp API credentials
 - [ ] Docker Hub / Registry credentials
 - [ ] Emergency contact information
 - [ ] Rollback procedures
+- [ ] Monitoring URLs and access procedures

-### Step 5: Train Your Team
+### Step 6: Train Your Team

-- [ ] Show team how to access Grafana dashboards
-- [ ] Demonstrate how to check logs: `kubectl logs`
-- [ ] Explain how to restart services if needed
-- [ ] Share this documentation with the team
-- [ ] Setup on-call rotation (if applicable)
+Conduct a training session covering:
+
+- [ ] **Access monitoring dashboards**
+  - Show how to login to https://monitoring.bakewise.ai/grafana
+  - Walk through key dashboards (Services Overview, Database, Infrastructure)
+  - Explain how to interpret metrics and identify issues
+
+- [ ] **Check application logs**
+  ```bash
+  # View logs for a service
+  kubectl logs -n bakery-ia deployment/orders-service --tail=100 -f
+
+  # Search for errors
+  kubectl logs -n bakery-ia deployment/gateway | grep ERROR
+  ```
+
+- [ ] **Restart services when needed**
+  ```bash
+  # Restart a service (rolling update, no downtime)
+  kubectl rollout restart deployment/orders-service -n bakery-ia
+  ```
+
+- [ ] **Respond to alerts**
+  - Show how to access AlertManager at https://monitoring.bakewise.ai/alertmanager
+  - Review common alerts and their resolution steps
+  - Reference the [Production Operations Guide](./PRODUCTION_OPERATIONS_GUIDE.md)
+
+- [ ] **Share documentation**
+  - [PILOT_LAUNCH_GUIDE.md](./PILOT_LAUNCH_GUIDE.md) - This guide
+  - [PRODUCTION_OPERATIONS_GUIDE.md](./PRODUCTION_OPERATIONS_GUIDE.md) - Daily operations
+  - [security-checklist.md](./security-checklist.md) - Security procedures
+
+- [ ] **Setup on-call rotation** (if applicable)
+  - Configure it in AlertManager
+  - Document escalation procedures

 ---
@@ -1050,16 +1196,25 @@ kubectl scale deployment monitoring -n bakery-ia --replicas=0

 ## Support Resources

-- **Full Monitoring Guide:** [MONITORING_DEPLOYMENT_SUMMARY.md](./MONITORING_DEPLOYMENT_SUMMARY.md)
-- **Operations Guide:** [PRODUCTION_OPERATIONS_GUIDE.md](./PRODUCTION_OPERATIONS_GUIDE.md)
-- **Security Guide:** [security-checklist.md](./security-checklist.md)
-- **Database Security:** [database-security.md](./database-security.md)
-- **TLS Configuration:** [tls-configuration.md](./tls-configuration.md)
+**Documentation:**
+- **Operations Guide:** [PRODUCTION_OPERATIONS_GUIDE.md](./PRODUCTION_OPERATIONS_GUIDE.md) - Daily operations, monitoring, incident response
+- **Security Guide:** [security-checklist.md](./security-checklist.md) - Security procedures and compliance
+- **Database Security:** [database-security.md](./database-security.md) - Database operations and TLS configuration
+- **TLS Configuration:** [tls-configuration.md](./tls-configuration.md) - Certificate management
+- **RBAC Implementation:** [rbac-implementation.md](./rbac-implementation.md) - Access control
+
+**Monitoring Access:**
+- **Grafana:** https://monitoring.bakewise.ai/grafana (admin / your-password)
+- **Prometheus:** https://monitoring.bakewise.ai/prometheus
+- **SigNoz:** https://monitoring.bakewise.ai/signoz
+- **AlertManager:** https://monitoring.bakewise.ai/alertmanager

 **External Resources:**
 - **MicroK8s Docs:** https://microk8s.io/docs
 - **Kubernetes Docs:** https://kubernetes.io/docs
 - **Let's Encrypt:** https://letsencrypt.org/docs
 - **Cloudflare DNS:** https://developers.cloudflare.com/dns
+- **Monitoring Stack README:** infrastructure/kubernetes/base/components/monitoring/README.md

 ---
@@ -32,7 +32,7 @@
 - **Services:** 18 microservices, 14 databases, monitoring stack
 - **Capacity:** 10-tenant pilot (scalable to 100+)
 - **Security:** TLS encryption, RBAC, audit logging
-- **Monitoring:** Prometheus, Grafana, AlertManager, Jaeger
+- **Monitoring:** Prometheus, Grafana, AlertManager, SigNoz

 **Key Metrics (10-tenant baseline):**
 - **Uptime Target:** 99.5% (3.65 hours downtime/month)
@@ -60,10 +60,10 @@

 **Production URLs:**
 ```
-https://monitoring.yourdomain.com/grafana        # Dashboards & visualization
-https://monitoring.yourdomain.com/prometheus     # Metrics & alerts
-https://monitoring.yourdomain.com/alertmanager   # Alert management
-https://monitoring.yourdomain.com/jaeger         # Distributed tracing
+https://monitoring.bakewise.ai/grafana           # Dashboards & visualization
+https://monitoring.bakewise.ai/prometheus        # Metrics & alerts
+https://monitoring.bakewise.ai/alertmanager      # Alert management
+https://monitoring.bakewise.ai/signoz            # Unified observability platform (traces, metrics, logs)
 ```

 **Port Forwarding (if ingress not available):**

@@ -77,8 +77,8 @@ kubectl port-forward -n monitoring svc/prometheus-external 9090:9090
 # AlertManager
 kubectl port-forward -n monitoring svc/alertmanager-external 9093:9093

-# Jaeger
-kubectl port-forward -n monitoring svc/jaeger-query 16686:16686
+# SigNoz
+kubectl port-forward -n monitoring svc/signoz-frontend 3301:3301
 ```

 ### Key Dashboards
@@ -1099,13 +1099,12 @@ kubectl exec -n bakery-ia deployment/auth-db -- \
 ## Support Resources

 **Documentation:**
-- [Pilot Launch Guide](./PILOT_LAUNCH_GUIDE.md) - Initial deployment
-- [Monitoring Summary](./MONITORING_DEPLOYMENT_SUMMARY.md) - Monitoring details
-- [Quick Start Monitoring](./QUICK_START_MONITORING.md) - Monitoring setup
-- [Security Checklist](./security-checklist.md) - Security procedures
-- [Database Security](./database-security.md) - Database operations
+- [Pilot Launch Guide](./PILOT_LAUNCH_GUIDE.md) - Initial deployment and setup
+- [Security Checklist](./security-checklist.md) - Security procedures and compliance
+- [Database Security](./database-security.md) - Database operations and best practices
 - [TLS Configuration](./tls-configuration.md) - Certificate management
-- [RBAC Implementation](./rbac-implementation.md) - Access control
+- [RBAC Implementation](./rbac-implementation.md) - Access control configuration
+- [Monitoring Stack README](../infrastructure/kubernetes/base/components/monitoring/README.md) - Detailed monitoring documentation

 **External Resources:**
 - Kubernetes: https://kubernetes.io/docs

@@ -1115,9 +1114,9 @@ kubectl exec -n bakery-ia deployment/auth-db -- \
 - PostgreSQL: https://www.postgresql.org/docs

 **Emergency Contacts:**
-- DevOps Team: devops@yourdomain.com
-- On-Call: oncall@yourdomain.com
-- Security Team: security@yourdomain.com
+- DevOps Team: devops@bakewise.ai
+- On-Call: oncall@bakewise.ai
+- Security Team: security@bakewise.ai

 ---
@@ -1,284 +0,0 @@
# 🚀 Quick Start: Deploy Monitoring to Production

**Time to deploy: ~15 minutes**

---

## Step 1: Update Secrets (5 min)

```bash
cd infrastructure/kubernetes/base/components/monitoring

# 1. Generate strong passwords
GRAFANA_PASS=$(openssl rand -base64 32)
echo "Grafana Password: $GRAFANA_PASS" > ~/SAVE_THIS_PASSWORD.txt

# 2. Edit secrets.yaml and replace:
# - CHANGE_ME_IN_PRODUCTION (Grafana password)
# - SMTP settings (your email server)
# - PostgreSQL connection string (your DB)

nano secrets.yaml
```

**Required Changes in secrets.yaml:**
```yaml
# Line 13: Change the Grafana password
admin-password: "YOUR_STRONG_PASSWORD_HERE"

# Lines 30-33: Update the SMTP settings
smtp-host: "smtp.gmail.com:587"
smtp-username: "your-alerts@yourdomain.com"
smtp-password: "YOUR_SMTP_PASSWORD"
smtp-from: "alerts@yourdomain.com"

# Line 49: Update the PostgreSQL connection
data-source-name: "postgresql://USER:PASSWORD@postgres.bakery-ia:5432/bakery?sslmode=require"
```

---

## Step 2: Update Alert Email Addresses (2 min)

```bash
# Edit alertmanager.yaml to set your team's email addresses
nano alertmanager.yaml

# Update these lines (search for @yourdomain.com):
# - Line 93: to: 'alerts@yourdomain.com'
# - Line 101: to: 'critical-alerts@yourdomain.com,oncall@yourdomain.com'
# - Line 116: to: 'alerts@yourdomain.com'
# - Line 125: to: 'alert-system-team@yourdomain.com'
# - Line 134: to: 'database-team@yourdomain.com'
# - Line 143: to: 'infra-team@yourdomain.com'
```

---
## Step 3: Deploy to Production (3 min)

```bash
# Return to the project root
cd /Users/urtzialfaro/Documents/bakery-ia

# Deploy the entire stack
kubectl apply -k infrastructure/kubernetes/overlays/prod

# Watch the pods come up
kubectl get pods -n monitoring -w
```

**Expected Output:**
```
NAME                      READY   STATUS    RESTARTS   AGE
prometheus-0              1/1     Running   0          2m
prometheus-1              1/1     Running   0          1m
alertmanager-0            2/2     Running   0          2m
alertmanager-1            2/2     Running   0          1m
alertmanager-2            2/2     Running   0          1m
grafana-xxxxx             1/1     Running   0          2m
postgres-exporter-xxxxx   1/1     Running   0          2m
node-exporter-xxxxx       1/1     Running   0          2m
jaeger-xxxxx              1/1     Running   0          2m
```

---

## Step 4: Verify Deployment (3 min)

```bash
# Check that all pods are running
kubectl get pods -n monitoring

# Check that storage is provisioned
kubectl get pvc -n monitoring

# Check that services are created
kubectl get svc -n monitoring
```

---

## Step 5: Access Dashboards (2 min)

### **Option A: Via Ingress (if configured)**
```
https://monitoring.yourdomain.com/grafana
https://monitoring.yourdomain.com/prometheus
https://monitoring.yourdomain.com/alertmanager
https://monitoring.yourdomain.com/jaeger
```

### **Option B: Via Port Forwarding**
```bash
# Grafana
kubectl port-forward -n monitoring svc/grafana 3000:3000 &

# Prometheus
kubectl port-forward -n monitoring svc/prometheus-external 9090:9090 &

# AlertManager
kubectl port-forward -n monitoring svc/alertmanager-external 9093:9093 &

# Jaeger
kubectl port-forward -n monitoring svc/jaeger-query 16686:16686 &

# Now access:
# - Grafana: http://localhost:3000 (admin / YOUR_PASSWORD)
# - Prometheus: http://localhost:9090
# - AlertManager: http://localhost:9093
# - Jaeger: http://localhost:16686
```

---
## Step 6: Verify Everything Works (5 min)

### **Check Prometheus Targets**
1. Open Prometheus: http://localhost:9090
2. Go to Status → Targets
3. Verify all targets are **UP**:
   - prometheus (1/1 up)
   - bakery-services (multiple pods up)
   - alertmanager (3/3 up)
   - postgres-exporter (1/1 up)
   - node-exporter (N/N up, where N = number of nodes)

### **Check Grafana Dashboards**
1. Open Grafana: http://localhost:3000
2. Log in with admin / YOUR_PASSWORD
3. Go to Dashboards → Browse
4. You should see 11 dashboards:
   - Bakery IA folder: Gateway Metrics, Services Overview, Circuit Breakers
   - Bakery IA - Extended folder: PostgreSQL, Node Exporter, AlertManager, Business Metrics
5. Open any dashboard and verify that data is loading

### **Test Alert Flow**
```bash
# Fire a test alert by creating a high-memory pod
kubectl run memory-test --image=polinux/stress --restart=Never \
  --namespace=bakery-ia -- stress --vm 1 --vm-bytes 600M --timeout 300s

# Wait 5 minutes, then check:
# 1. Prometheus alerts: http://localhost:9090/alerts
#    - "HighMemoryUsage" should be firing
# 2. AlertManager: http://localhost:9093
#    - The alert should be visible
# 3. Email inbox - you should receive a notification

# Clean up
kubectl delete pod memory-test -n bakery-ia
```

### **Verify Jaeger Tracing**
1. Make a request to your API:
   ```bash
   curl -H "Authorization: Bearer YOUR_TOKEN" \
     https://api.yourdomain.com/api/v1/health
   ```
2. Open Jaeger: http://localhost:16686
3. Select a service from the dropdown
4. Click "Find Traces"
5. You should see traces appearing

---

## ✅ Success Criteria

Your monitoring is working correctly if:

- [x] All Prometheus targets show "UP" status
- [x] Grafana dashboards display metrics
- [x] The AlertManager cluster shows 3/3 members
- [x] A test alert fired and the email was received
- [x] Jaeger shows traces from services
- [x] No pods are in CrashLoopBackOff state
- [x] All PVCs are Bound

---
## 🔧 Troubleshooting

### **Problem: Pods not starting**
```bash
# Check pod status
kubectl describe pod POD_NAME -n monitoring

# Check logs
kubectl logs POD_NAME -n monitoring

# Common issues:
# - Insufficient resources: check node capacity
# - PVC not binding: check that the storage class exists
# - Image pull errors: check network/registry access
```

### **Problem: Prometheus targets DOWN**
```bash
# Check if the services exist
kubectl get svc -n bakery-ia

# Check if the pods have the correct labels
kubectl get pods -n bakery-ia --show-labels

# Check if the pods expose the metrics port (8080)
kubectl get pod POD_NAME -n bakery-ia -o yaml | grep -A 5 ports
```
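
A common cause of DOWN targets is missing scrape metadata on the pods; below is a hedged sketch assuming the scrape config keys on the common `prometheus.io/*` annotation convention (check prometheus.yaml for the actual relabeling rules):

```yaml
# Hypothetical pod-template metadata that a kubernetes_sd + relabeling setup
# often uses to discover scrape targets.
metadata:
  labels:
    app: orders-service            # assumed label
  annotations:
    prometheus.io/scrape: "true"
    prometheus.io/port: "8080"     # must match the container's metrics port
    prometheus.io/path: "/metrics"
```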

### **Problem: Grafana shows "No Data"**
```bash
# Test the Prometheus datasource
kubectl port-forward -n monitoring svc/prometheus-external 9090:9090

# Run a test query against Prometheus
curl "http://localhost:9090/api/v1/query?query=up" | jq

# If Prometheus has data but Grafana doesn't, check the Grafana datasource config
```
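
For that last check, a minimal Grafana datasource provisioning sketch; the in-cluster URL is an assumption based on this stack's `prometheus-external` service name:

```yaml
# Hypothetical datasource provisioning file (the real one is mounted by
# grafana.yaml): Grafana reaches Prometheus via the service's cluster DNS name.
apiVersion: 1
datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus-external.monitoring.svc.cluster.local:9090
    isDefault: true
```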

### **Problem: Alerts not firing**
```bash
# Check that the alert rules are loaded
kubectl logs -n monitoring prometheus-0 | grep "Loading configuration"

# Check the AlertManager config
kubectl exec -n monitoring alertmanager-0 -- cat /etc/alertmanager/alertmanager.yml

# Test the SMTP connection
kubectl exec -n monitoring alertmanager-0 -- \
  nc -zv smtp.gmail.com 587
```

---

## 📞 Need Help?

1. Check the full documentation: [infrastructure/kubernetes/base/components/monitoring/README.md](infrastructure/kubernetes/base/components/monitoring/README.md)
2. Review the deployment summary: [MONITORING_DEPLOYMENT_SUMMARY.md](MONITORING_DEPLOYMENT_SUMMARY.md)
3. Check Prometheus logs: `kubectl logs -n monitoring prometheus-0`
4. Check AlertManager logs: `kubectl logs -n monitoring alertmanager-0`
5. Check Grafana logs: `kubectl logs -n monitoring deployment/grafana`

---

## 🎉 You're Done!

Your monitoring stack is now running in production!

**Next steps:**
1. Save your Grafana password securely
2. Set up the on-call rotation
3. Review alert thresholds and adjust as needed
4. Create team-specific dashboards
5. Train the team on using the monitoring tools

**Access your monitoring:**
- Grafana: https://monitoring.yourdomain.com/grafana
- Prometheus: https://monitoring.yourdomain.com/prometheus
- AlertManager: https://monitoring.yourdomain.com/alertmanager
- Jaeger: https://monitoring.yourdomain.com/jaeger

---

*Deployment time: ~15 minutes*
*Last updated: 2026-01-07*
@@ -10,7 +10,7 @@ import resource
 import os
 from fastapi import FastAPI, Request, HTTPException, Depends, WebSocket, WebSocketDisconnect
 from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import JSONResponse, StreamingResponse
+from fastapi.responses import JSONResponse, StreamingResponse, Response
 import httpx
 import time
 from shared.redis_utils import initialize_redis, close_redis, get_redis_client
@@ -27,7 +27,42 @@ from app.middleware.demo_middleware import DemoMiddleware
 from app.middleware.read_only_mode import ReadOnlyModeMiddleware
 from app.routes import auth, tenant, notification, nominatim, subscription, demo, pos, geocoding, poi_context
 from shared.monitoring.logging import setup_logging
-from shared.monitoring.metrics import MetricsCollector
+from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware
+
+# OpenTelemetry imports
+from opentelemetry import trace
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import BatchSpanProcessor
+from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
+from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
+from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
+from opentelemetry.instrumentation.redis import RedisInstrumentor
+from opentelemetry.sdk.resources import Resource
+
+# Configure OpenTelemetry tracing
+def setup_tracing(service_name: str = "gateway"):
+    """Initialize OpenTelemetry tracing with OTLP exporter for Jaeger"""
+    # Create resource with service name
+    resource = Resource.create({"service.name": service_name})
+
+    # Configure OTLP exporter (sends to OpenTelemetry Collector)
+    otlp_exporter = OTLPSpanExporter(
+        endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://otel-collector.monitoring.svc.cluster.local:4317"),
+        insecure=True  # Use insecure connection for internal cluster communication
+    )
+
+    # Configure tracer provider
+    provider = TracerProvider(resource=resource)
+    processor = BatchSpanProcessor(otlp_exporter)
+    provider.add_span_processor(processor)
+
+    # Set global tracer provider
+    trace.set_tracer_provider(provider)
+
+    return provider
+
+# Initialize tracing
+tracer_provider = setup_tracing("gateway")

 # Setup logging
 setup_logging("gateway", settings.LOG_LEVEL)
@@ -75,9 +110,21 @@ app = FastAPI(
|
||||
redirect_slashes=False # Disable automatic trailing slash redirects
|
||||
)
|
||||
|
||||
# Instrument FastAPI with OpenTelemetry
|
||||
FastAPIInstrumentor.instrument_app(app)
|
||||
|
||||
# Instrument httpx for outgoing requests
|
||||
HTTPXClientInstrumentor().instrument()
|
||||
|
||||
# Instrument Redis (will be active once redis client is initialized)
|
||||
RedisInstrumentor().instrument()
|
||||
|
||||
# Initialize metrics collector
|
||||
metrics_collector = MetricsCollector("gateway")
|
||||
|
||||
# Add metrics middleware to track HTTP requests
|
||||
add_metrics_middleware(app, metrics_collector)
|
||||
|
||||
# Redis client for SSE streaming
|
||||
redis_client = None
|
||||
|
||||
@@ -182,8 +229,11 @@ async def health_check():
|
||||
|
||||
@app.get("/metrics")
|
||||
async def metrics():
|
||||
"""Metrics endpoint for monitoring"""
|
||||
return {"metrics": "enabled"}
|
||||
"""Prometheus metrics endpoint"""
|
||||
return Response(
|
||||
content=metrics_collector.get_metrics(),
|
||||
media_type="text/plain; version=0.0.4; charset=utf-8"
|
||||
)
|
||||
|
||||
# ================================================================
|
||||
# SERVER-SENT EVENTS (SSE) HELPER FUNCTIONS
|
||||

@@ -19,3 +19,9 @@ sqlalchemy==2.0.44
asyncpg==0.30.0
cryptography==44.0.0
ortools==9.8.3296
opentelemetry-api==1.27.0
opentelemetry-sdk==1.27.0
opentelemetry-instrumentation-fastapi==0.48b0
opentelemetry-exporter-otlp-proto-grpc==1.27.0
opentelemetry-instrumentation-httpx==0.48b0
opentelemetry-instrumentation-redis==0.48b0
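
A minimal sanity check that the new pins install and import cleanly, assuming you run it inside the service's virtualenv:

```bash
# Verify the OpenTelemetry pins resolve and import (run inside the service venv)
pip install -r requirements.txt
python -c "import opentelemetry.trace, opentelemetry.sdk.trace; print('opentelemetry OK')"
```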

@@ -1,201 +0,0 @@
# Infrastructure Cleanup Summary

**Date:** 2026-01-07
**Action:** Removed legacy Docker Compose infrastructure files

---

## Deleted Directories and Files

The following legacy infrastructure files have been removed as they were specific to Docker Compose deployment and are **not used** in the Kubernetes deployment:

### ❌ Removed:
- `infrastructure/pgadmin/` - pgAdmin configuration for Docker Compose
  - `pgpass` - Password file
  - `servers.json` - Server definitions

- `infrastructure/postgres/` - PostgreSQL configuration for Docker Compose
  - `init-scripts/init.sql` - Database initialization

- `infrastructure/rabbitmq/` - RabbitMQ configuration for Docker Compose
  - `definitions.json` - Queue/exchange definitions
  - `rabbitmq.conf` - RabbitMQ settings

- `infrastructure/redis/` - Redis configuration for Docker Compose
  - `redis.conf` - Redis settings

- `infrastructure/terraform/` - Terraform infrastructure-as-code (unused)
  - `base/`, `dev/`, `staging/`, `production/` directories
  - `modules/` directory

- `infrastructure/rabbitmq.conf` - Standalone RabbitMQ config file

### ✅ Retained:

#### `infrastructure/kubernetes/`
**Purpose:** Complete Kubernetes deployment manifests
**Status:** Active and required
**Contents:**
- `base/` - Base Kubernetes resources
  - `components/` - All service deployments
  - `databases/` - Database deployments (uses embedded configs)
  - `monitoring/` - Prometheus, Grafana, AlertManager
  - `migrations/` - Database migration jobs
  - `secrets/` - TLS secrets and application secrets
  - `configmaps/` - PostgreSQL logging config
- `overlays/` - Environment-specific configurations
  - `dev/` - Development overlay
  - `prod/` - Production overlay
- `encryption/` - Kubernetes secrets encryption config

#### `infrastructure/tls/`
**Purpose:** TLS/SSL certificates for database encryption
**Status:** Active and required
**Contents:**
- `ca/` - Certificate Authority (10-year validity)
  - `ca-cert.pem` - CA certificate
  - `ca-key.pem` - CA private key (KEEP SECURE!)
- `postgres/` - PostgreSQL server certificates (3-year validity)
  - `server-cert.pem`, `server-key.pem`, `ca-cert.pem`
- `redis/` - Redis server certificates (3-year validity)
  - `redis-cert.pem`, `redis-key.pem`, `ca-cert.pem`
- `generate-certificates.sh` - Certificate generation script

---

## Why These Were Removed

### Docker Compose vs Kubernetes

The removed files were configuration files for **Docker Compose** deployments:
- pgAdmin was used for local database management (not needed in prod)
- Standalone config files (rabbitmq.conf, redis.conf, postgres init scripts) were mounted as volumes in Docker Compose
- Terraform was an unused infrastructure-as-code attempt

### Kubernetes Uses a Different Approach

Kubernetes deployment uses:
- **ConfigMaps** instead of config files
- **Secrets** instead of environment files
- **Kubernetes manifests** instead of docker-compose.yml
- **Built-in orchestration** instead of Terraform

**Example:**
```yaml
# OLD (Docker Compose):
volumes:
  - ./infrastructure/rabbitmq/rabbitmq.conf:/etc/rabbitmq/rabbitmq.conf

# NEW (Kubernetes):
env:
  - name: RABBITMQ_DEFAULT_USER
    valueFrom:
      secretKeyRef:
        name: rabbitmq-secrets
        key: RABBITMQ_USER
```

---

## Verification

### No References Found
Searched the entire codebase and confirmed **zero references** to the removed folders:
```bash
grep -r "infrastructure/pgadmin" --include="*.yaml" --include="*.sh"
# No results

grep -r "infrastructure/terraform" --include="*.yaml" --include="*.sh"
# No results
```

### Kubernetes Deployment Unaffected
- All services use Kubernetes ConfigMaps and Secrets
- Database configs embedded in deployment YAML files
- TLS certificates managed via Kubernetes Secrets (from `infrastructure/tls/`)

---

## Current Infrastructure Structure

```
infrastructure/
├── kubernetes/              # ✅ ACTIVE - All K8s manifests
│   ├── base/                # Base resources
│   │   ├── components/      # Service deployments
│   │   ├── secrets/         # TLS secrets
│   │   ├── configmaps/      # Configuration
│   │   └── kustomization.yaml  # Base kustomization
│   ├── overlays/            # Environment overlays
│   │   ├── dev/             # Development
│   │   └── prod/            # Production
│   └── encryption/          # K8s secrets encryption
└── tls/                     # ✅ ACTIVE - TLS certificates
    ├── ca/                  # Certificate Authority
    ├── postgres/            # PostgreSQL certs
    ├── redis/               # Redis certs
    └── generate-certificates.sh

REMOVED (Docker Compose legacy):
├── pgadmin/                 # ❌ DELETED
├── postgres/                # ❌ DELETED
├── rabbitmq/                # ❌ DELETED
├── redis/                   # ❌ DELETED
├── terraform/               # ❌ DELETED
└── rabbitmq.conf            # ❌ DELETED
```

---

## Impact Assessment

### ✅ No Breaking Changes
- Kubernetes deployment unchanged
- All services continue to work
- TLS certificates still available
- Production readiness maintained

### ✅ Benefits
- Cleaner repository structure
- Less confusion about which configs are used
- Faster repository cloning (smaller size)
- Clear separation: Kubernetes-only deployment

### ✅ Documentation Updated
- [PILOT_LAUNCH_GUIDE.md](../docs/PILOT_LAUNCH_GUIDE.md) - Uses only Kubernetes
- [PRODUCTION_OPERATIONS_GUIDE.md](../docs/PRODUCTION_OPERATIONS_GUIDE.md) - References only K8s resources
- [infrastructure/kubernetes/README.md](kubernetes/README.md) - K8s-specific documentation

---

## Rollback (If Needed)

If for any reason you need these files back, they can be restored from git:

```bash
# View deleted files
git log --diff-filter=D --summary | grep infrastructure

# Restore specific folder (example)
git checkout HEAD~1 -- infrastructure/pgadmin/

# Or restore all deleted infrastructure
git checkout HEAD~1 -- infrastructure/
```

**Note:** You won't need these for Kubernetes deployment. They were Docker Compose specific.

---

## Related Documentation

- [Kubernetes README](kubernetes/README.md) - K8s deployment guide
- [TLS Configuration](../docs/tls-configuration.md) - Certificate management
- [Database Security](../docs/database-security.md) - Database encryption
- [Pilot Launch Guide](../docs/PILOT_LAUNCH_GUIDE.md) - Production deployment

---

**Cleanup Performed By:** Claude Code
**Verified By:** Infrastructure analysis and grep searches
**Status:** ✅ Complete - No issues found

infrastructure/helm/signoz-values-dev.yaml (new file, 316 lines)
@@ -0,0 +1,316 @@
# SigNoz Helm Chart Values - Development Environment
# Optimized for local development with minimal resource usage
#
# Official Chart: https://github.com/SigNoz/charts
# Install Command: helm install signoz signoz/signoz -n signoz --create-namespace -f signoz-values-dev.yaml

global:
  storageClass: "standard"
  domain: "localhost"

# Frontend Configuration
frontend:
  replicaCount: 1
  image:
    repository: signoz/frontend
    tag: 0.52.3
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    port: 3301

  ingress:
    enabled: true
    className: nginx
    annotations:
      nginx.ingress.kubernetes.io/rewrite-target: /$2
      nginx.ingress.kubernetes.io/use-regex: "true"
    hosts:
      - host: localhost
        paths:
          - path: /signoz(/|$)(.*)
            pathType: ImplementationSpecific
    tls: []

  resources:
    requests:
      cpu: 50m
      memory: 128Mi
    limits:
      cpu: 200m
      memory: 256Mi

  env:
    - name: FRONTEND_REFRESH_INTERVAL
      value: "30000"

# Query Service Configuration
queryService:
  replicaCount: 1
  image:
    repository: signoz/query-service
    tag: 0.52.3
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    port: 8080

  resources:
    requests:
      cpu: 100m
      memory: 256Mi
    limits:
      cpu: 500m
      memory: 512Mi

  env:
    - name: DEPLOYMENT_TYPE
      value: "kubernetes-helm"
    - name: SIGNOZ_LOCAL_DB_PATH
      value: "/var/lib/signoz"

  persistence:
    enabled: true
    size: 5Gi
    storageClass: "standard"

# AlertManager Configuration
alertmanager:
  replicaCount: 1
  image:
    repository: signoz/alertmanager
    tag: 0.23.5
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    port: 9093

  resources:
    requests:
      cpu: 50m
      memory: 128Mi
    limits:
      cpu: 200m
      memory: 256Mi

  persistence:
    enabled: true
    size: 2Gi
    storageClass: "standard"

  config:
    global:
      resolve_timeout: 5m
    route:
      group_by: ['alertname', 'cluster', 'service']
      group_wait: 10s
      group_interval: 10s
      repeat_interval: 12h
      receiver: 'default'
    receivers:
      - name: 'default'
        # Add email, slack, webhook configs here

# ClickHouse Configuration - Time Series Database
clickhouse:
  replicaCount: 1
  image:
    repository: clickhouse/clickhouse-server
    tag: 24.1.2-alpine
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    httpPort: 8123
    tcpPort: 9000

  resources:
    requests:
      cpu: 500m
      memory: 512Mi
    limits:
      cpu: 1000m
      memory: 1Gi

  persistence:
    enabled: true
    size: 10Gi
    storageClass: "standard"

  # ClickHouse configuration
  config:
    logger:
      level: information
    max_connections: 1024
    max_concurrent_queries: 100
    # Data retention (7 days for dev)
    merge_tree:
      parts_to_delay_insert: 150
      parts_to_throw_insert: 300

# OpenTelemetry Collector - Integrated with SigNoz
otelCollector:
  enabled: true
  replicaCount: 1
  image:
    repository: signoz/signoz-otel-collector
    tag: 0.102.8
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    ports:
      otlpGrpc: 4317
      otlpHttp: 4318
      metrics: 8888
      healthCheck: 13133

  resources:
    requests:
      cpu: 100m
      memory: 256Mi
    limits:
      cpu: 500m
      memory: 512Mi

  # Full OTEL Collector Configuration
  config:
    extensions:
      health_check:
        endpoint: 0.0.0.0:13133
      zpages:
        endpoint: 0.0.0.0:55679

    receivers:
      otlp:
        protocols:
          grpc:
            endpoint: 0.0.0.0:4317
          http:
            endpoint: 0.0.0.0:4318
            cors:
              allowed_origins:
                - "http://localhost"
                - "https://localhost"

      # Prometheus receiver for scraping metrics
      prometheus:
        config:
          scrape_configs:
            - job_name: 'otel-collector'
              scrape_interval: 30s
              static_configs:
                - targets: ['localhost:8888']

    processors:
      batch:
        timeout: 10s
        send_batch_size: 1024

      memory_limiter:
        check_interval: 1s
        limit_mib: 400
        spike_limit_mib: 100

      # Resource detection for K8s
      resourcedetection:
        detectors: [env, system, docker]
        timeout: 5s

      # Add resource attributes
      resource:
        attributes:
          - key: deployment.environment
            value: development
            action: upsert

    exporters:
      # Export to SigNoz ClickHouse
      clickhousetraces:
        datasource: tcp://clickhouse:9000/?database=signoz_traces
        timeout: 10s

      clickhousemetricswrite:
        endpoint: tcp://clickhouse:9000/?database=signoz_metrics
        timeout: 10s

      clickhouselogsexporter:
        dsn: tcp://clickhouse:9000/?database=signoz_logs
        timeout: 10s

      # Debug logging
      logging:
        loglevel: info
        sampling_initial: 5
        sampling_thereafter: 200

    service:
      extensions: [health_check, zpages]
      pipelines:
        traces:
          receivers: [otlp]
          processors: [memory_limiter, batch, resourcedetection, resource]
          exporters: [clickhousetraces, logging]

        metrics:
          receivers: [otlp, prometheus]
          processors: [memory_limiter, batch, resourcedetection, resource]
          exporters: [clickhousemetricswrite]

        logs:
          receivers: [otlp]
          processors: [memory_limiter, batch, resourcedetection, resource]
          exporters: [clickhouselogsexporter, logging]

# OpenTelemetry Collector Deployment Mode
otelCollectorDeployment:
  enabled: true
  mode: deployment

# Node Exporter for infrastructure metrics (optional)
nodeExporter:
  enabled: true
  service:
    type: ClusterIP
    port: 9100

  resources:
    requests:
      cpu: 50m
      memory: 64Mi
    limits:
      cpu: 100m
      memory: 128Mi

# Schemamanager - Manages ClickHouse schema
schemamanager:
  enabled: true
  image:
    repository: signoz/signoz-schema-migrator
    tag: 0.52.3
    pullPolicy: IfNotPresent

# Additional Configuration
serviceAccount:
  create: true
  annotations: {}
  name: ""

# Security Context
securityContext:
  runAsNonRoot: true
  runAsUser: 1000
  fsGroup: 1000

# Network Policies (disabled for dev)
networkPolicy:
  enabled: false

# Monitoring SigNoz itself
selfMonitoring:
  enabled: true
  serviceMonitor:
    enabled: false
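
As a usage note, here is a sketch of deploying this dev profile; the release name and install command follow the file's own header, while the `helm repo add` step is an assumption about a fresh workstation:

```bash
# Deploy SigNoz locally with the dev values (sketch; assumes the official chart repo)
helm repo add signoz https://charts.signoz.io
helm repo update
helm install signoz signoz/signoz -n signoz --create-namespace \
  -f infrastructure/helm/signoz-values-dev.yaml
kubectl get pods -n signoz   # components should appear here once deployed
```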

infrastructure/helm/signoz-values-prod.yaml (new file, 471 lines)
@@ -0,0 +1,471 @@
# SigNoz Helm Chart Values - Production Environment
# High-availability configuration with resource optimization
#
# Official Chart: https://github.com/SigNoz/charts
# Install Command: helm install signoz signoz/signoz -n signoz --create-namespace -f signoz-values-prod.yaml

global:
  storageClass: "standard"
  domain: "monitoring.bakewise.ai"

# Frontend Configuration
frontend:
  replicaCount: 2
  image:
    repository: signoz/frontend
    tag: 0.52.3
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    port: 3301

  ingress:
    enabled: true
    className: nginx
    annotations:
      nginx.ingress.kubernetes.io/rewrite-target: /$2
      nginx.ingress.kubernetes.io/use-regex: "true"
      cert-manager.io/cluster-issuer: "letsencrypt-prod"
      nginx.ingress.kubernetes.io/ssl-redirect: "true"
    hosts:
      - host: monitoring.bakewise.ai
        paths:
          - path: /signoz(/|$)(.*)
            pathType: ImplementationSpecific
    tls:
      - secretName: signoz-tls
        hosts:
          - monitoring.bakewise.ai

  resources:
    requests:
      cpu: 250m
      memory: 512Mi
    limits:
      cpu: 500m
      memory: 1Gi

  # Pod Anti-affinity for HA
  affinity:
    podAntiAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 100
          podAffinityTerm:
            labelSelector:
              matchExpressions:
                - key: app
                  operator: In
                  values:
                    - signoz-frontend
            topologyKey: kubernetes.io/hostname

  env:
    - name: FRONTEND_REFRESH_INTERVAL
      value: "30000"

# Query Service Configuration
queryService:
  replicaCount: 2
  image:
    repository: signoz/query-service
    tag: 0.52.3
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    port: 8080

  resources:
    requests:
      cpu: 500m
      memory: 1Gi
    limits:
      cpu: 1000m
      memory: 2Gi

  # Pod Anti-affinity for HA
  affinity:
    podAntiAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 100
          podAffinityTerm:
            labelSelector:
              matchExpressions:
                - key: app
                  operator: In
                  values:
                    - signoz-query-service
            topologyKey: kubernetes.io/hostname

  env:
    - name: DEPLOYMENT_TYPE
      value: "kubernetes-helm"
    - name: SIGNOZ_LOCAL_DB_PATH
      value: "/var/lib/signoz"
    - name: RETENTION_DAYS
      value: "30"

  persistence:
    enabled: true
    size: 20Gi
    storageClass: "standard"

  # Horizontal Pod Autoscaler
  autoscaling:
    enabled: true
    minReplicas: 2
    maxReplicas: 5
    targetCPUUtilizationPercentage: 70
    targetMemoryUtilizationPercentage: 80

# AlertManager Configuration
alertmanager:
  replicaCount: 2
  image:
    repository: signoz/alertmanager
    tag: 0.23.5
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    port: 9093

  resources:
    requests:
      cpu: 250m
      memory: 512Mi
    limits:
      cpu: 500m
      memory: 1Gi

  # Pod Anti-affinity for HA
  affinity:
    podAntiAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 100
          podAffinityTerm:
            labelSelector:
              matchExpressions:
                - key: app
                  operator: In
                  values:
                    - signoz-alertmanager
            topologyKey: kubernetes.io/hostname

  persistence:
    enabled: true
    size: 5Gi
    storageClass: "standard"

  config:
    global:
      resolve_timeout: 5m
      smtp_smarthost: 'smtp.gmail.com:587'
      smtp_from: 'alerts@bakewise.ai'
      smtp_auth_username: 'alerts@bakewise.ai'
      smtp_auth_password: '${SMTP_PASSWORD}'
      smtp_require_tls: true

    route:
      group_by: ['alertname', 'cluster', 'service', 'severity']
      group_wait: 10s
      group_interval: 10s
      repeat_interval: 12h
      receiver: 'critical-alerts'
      routes:
        - match:
            severity: critical
          receiver: 'critical-alerts'
          continue: true
        - match:
            severity: warning
          receiver: 'warning-alerts'

    receivers:
      - name: 'critical-alerts'
        email_configs:
          - to: 'critical-alerts@bakewise.ai'
            headers:
              Subject: '[CRITICAL] {{ .GroupLabels.alertname }} - Bakery IA'
        # Slack webhook for critical alerts
        slack_configs:
          - api_url: '${SLACK_WEBHOOK_URL}'
            channel: '#alerts-critical'
            title: '[CRITICAL] {{ .GroupLabels.alertname }}'
            text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'

      - name: 'warning-alerts'
        email_configs:
          - to: 'oncall@bakewise.ai'
            headers:
              Subject: '[WARNING] {{ .GroupLabels.alertname }} - Bakery IA'

# ClickHouse Configuration - Time Series Database
clickhouse:
  replicaCount: 2
  image:
    repository: clickhouse/clickhouse-server
    tag: 24.1.2-alpine
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    httpPort: 8123
    tcpPort: 9000

  resources:
    requests:
      cpu: 1000m
      memory: 2Gi
    limits:
      cpu: 2000m
      memory: 4Gi

  # Pod Anti-affinity for HA
  affinity:
    podAntiAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        - labelSelector:
            matchExpressions:
              - key: app
                operator: In
                values:
                  - signoz-clickhouse
          topologyKey: kubernetes.io/hostname

  persistence:
    enabled: true
    size: 100Gi
    storageClass: "standard"

  # ClickHouse configuration
  config:
    logger:
      level: information
    max_connections: 4096
    max_concurrent_queries: 500
    # Data retention (30 days for prod)
    merge_tree:
      parts_to_delay_insert: 150
      parts_to_throw_insert: 300
    # Performance tuning
    max_memory_usage: 10000000000
    max_bytes_before_external_group_by: 20000000000

  # Backup configuration
  backup:
    enabled: true
    schedule: "0 2 * * *"
    retention: 7

# OpenTelemetry Collector - Integrated with SigNoz
otelCollector:
  enabled: true
  replicaCount: 2
  image:
    repository: signoz/signoz-otel-collector
    tag: 0.102.8
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    ports:
      otlpGrpc: 4317
      otlpHttp: 4318
      metrics: 8888
      healthCheck: 13133

  resources:
    requests:
      cpu: 500m
      memory: 512Mi
    limits:
      cpu: 1000m
      memory: 1Gi

  # Full OTEL Collector Configuration
  config:
    extensions:
      health_check:
        endpoint: 0.0.0.0:13133
      zpages:
        endpoint: 0.0.0.0:55679

    receivers:
      otlp:
        protocols:
          grpc:
            endpoint: 0.0.0.0:4317
            max_recv_msg_size_mib: 16
          http:
            endpoint: 0.0.0.0:4318
            cors:
              allowed_origins:
                - "https://monitoring.bakewise.ai"
                - "https://*.bakewise.ai"

      # Prometheus receiver for scraping metrics
      prometheus:
        config:
          scrape_configs:
            - job_name: 'otel-collector'
              scrape_interval: 30s
              static_configs:
                - targets: ['localhost:8888']

    processors:
      batch:
        timeout: 10s
        send_batch_size: 2048
        send_batch_max_size: 4096

      memory_limiter:
        check_interval: 1s
        limit_mib: 800
        spike_limit_mib: 200

      # Resource detection for K8s
      resourcedetection:
        detectors: [env, system, docker]
        timeout: 5s

      # Add resource attributes
      resource:
        attributes:
          - key: deployment.environment
            value: production
            action: upsert
          - key: cluster.name
            value: bakery-ia-prod
            action: upsert

    exporters:
      # Export to SigNoz ClickHouse
      clickhousetraces:
        datasource: tcp://clickhouse:9000/?database=signoz_traces
        timeout: 10s
        retry_on_failure:
          enabled: true
          initial_interval: 5s
          max_interval: 30s
          max_elapsed_time: 300s

      clickhousemetricswrite:
        endpoint: tcp://clickhouse:9000/?database=signoz_metrics
        timeout: 10s
        retry_on_failure:
          enabled: true
          initial_interval: 5s
          max_interval: 30s
          max_elapsed_time: 300s

      clickhouselogsexporter:
        dsn: tcp://clickhouse:9000/?database=signoz_logs
        timeout: 10s
        retry_on_failure:
          enabled: true
          initial_interval: 5s
          max_interval: 30s
          max_elapsed_time: 300s

      # Minimal logging for prod
      logging:
        loglevel: warn
        sampling_initial: 2
        sampling_thereafter: 500

    service:
      extensions: [health_check, zpages]
      pipelines:
        traces:
          receivers: [otlp]
          processors: [memory_limiter, batch, resourcedetection, resource]
          exporters: [clickhousetraces, logging]

        metrics:
          receivers: [otlp, prometheus]
          processors: [memory_limiter, batch, resourcedetection, resource]
          exporters: [clickhousemetricswrite]

        logs:
          receivers: [otlp]
          processors: [memory_limiter, batch, resourcedetection, resource]
          exporters: [clickhouselogsexporter, logging]

# OpenTelemetry Collector Deployment Mode
otelCollectorDeployment:
  enabled: true
  mode: deployment

  # HPA for OTEL Collector
  autoscaling:
    enabled: true
    minReplicas: 2
    maxReplicas: 10
    targetCPUUtilizationPercentage: 70
    targetMemoryUtilizationPercentage: 80

# Node Exporter for infrastructure metrics
nodeExporter:
  enabled: true
  service:
    type: ClusterIP
    port: 9100

  resources:
    requests:
      cpu: 100m
      memory: 128Mi
    limits:
      cpu: 200m
      memory: 256Mi

# Schemamanager - Manages ClickHouse schema
schemamanager:
  enabled: true
  image:
    repository: signoz/signoz-schema-migrator
    tag: 0.52.3
    pullPolicy: IfNotPresent

# Additional Configuration
serviceAccount:
  create: true
  annotations: {}
  name: "signoz"

# Security Context
securityContext:
  runAsNonRoot: true
  runAsUser: 1000
  fsGroup: 1000

# Pod Disruption Budgets for HA
podDisruptionBudget:
  frontend:
    enabled: true
    minAvailable: 1
  queryService:
    enabled: true
    minAvailable: 1
  alertmanager:
    enabled: true
    minAvailable: 1
  clickhouse:
    enabled: true
    minAvailable: 1

# Network Policies for security
networkPolicy:
  enabled: true
  policyTypes:
    - Ingress
    - Egress

# Monitoring SigNoz itself
selfMonitoring:
  enabled: true
  serviceMonitor:
    enabled: true
    interval: 30s
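
A production rollout sketch follows. Note that Helm does not expand `${SMTP_PASSWORD}` / `${SLACK_WEBHOOK_URL}` inside values files, so the `envsubst` substitution step is an assumption about how those secrets get injected:

```bash
# Render secrets into the values file before installing (envsubst step is an assumption)
export SMTP_PASSWORD='...' SLACK_WEBHOOK_URL='...'
envsubst < infrastructure/helm/signoz-values-prod.yaml > /tmp/signoz-values-prod.yaml
helm upgrade --install signoz signoz/signoz -n signoz --create-namespace \
  -f /tmp/signoz-values-prod.yaml
```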

@@ -4,7 +4,7 @@ This directory contains Kubernetes manifests for deploying the Bakery IA platfor

## Quick Start

Deploy the entire platform with these 5 commands:
Deploy the entire platform with these 4 commands:

```bash
# 1. Start Colima with adequate resources
@@ -17,15 +17,14 @@ kind create cluster --config kind-config.yaml
kubectl apply -f https://raw.githubusercontent.com/kubernetes/ingress-nginx/main/deploy/static/provider/kind/deploy.yaml
kubectl wait --namespace ingress-nginx --for=condition=ready pod --selector=app.kubernetes.io/component=controller --timeout=300s

# 4. Configure permanent localhost access
kubectl patch svc ingress-nginx-controller -n ingress-nginx -p '{"spec":{"type":"NodePort","ports":[{"name":"http","port":80,"targetPort":"http","nodePort":30080},{"name":"https","port":443,"targetPort":"https","nodePort":30443}]}}'
# 4. Deploy with Tilt
tilt up

# 5. Deploy with Skaffold
skaffold dev --profile=dev

# 🎉 Access at: https://localhost
# 🎉 Access at: http://localhost (or see Tilt for individual service ports)
```

> **Note**: The kind-config.yaml already configures port mappings (30080→80, 30443→443) for localhost access, so no additional service patching is needed. The NGINX Ingress for Kind uses NodePort by default on those exact ports.
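
Since no service patching is involved anymore, reachability is easy to sanity-check; a sketch, assuming the kind cluster and ingress controller are up (a 404 from nginx before anything is deployed is expected):

```bash
# Confirm the kind port mappings reach the NGINX ingress controller
curl -si http://localhost/ | head -n 1
curl -sik https://localhost/ | head -n 1
```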

## Prerequisites

Install the following tools on macOS:
@@ -100,11 +99,11 @@ Then access via:

### Start Development Environment
```bash
# Start development mode with hot-reload
skaffold dev --profile=dev
# Start development mode with hot-reload using Tilt
tilt up

# Or one-time deployment
skaffold run --profile=dev
# Or start in background
tilt up --stream
```

### Key Features
@@ -246,13 +245,39 @@ colima stop --profile k8s-local

### Restart Sequence
```bash
# Post-restart startup
# Post-restart startup (or use the kubernetes_restart.sh script)
colima start --cpu 6 --memory 12 --disk 120 --runtime docker --profile k8s-local
kind create cluster --config kind-config.yaml
skaffold dev --profile=dev
kubectl apply -f https://raw.githubusercontent.com/kubernetes/ingress-nginx/main/deploy/static/provider/kind/deploy.yaml
kubectl wait --namespace ingress-nginx --for=condition=ready pod --selector=app.kubernetes.io/component=controller --timeout=300s
tilt up
```

## Production Considerations
## Production Deployment

### Production URLs

The production environment uses the following domains:

- **Main Application**: https://bakewise.ai
  - Frontend application and all public pages
  - API endpoints: https://bakewise.ai/api/v1/...

- **Monitoring Stack**: https://monitoring.bakewise.ai
  - Grafana: https://monitoring.bakewise.ai/grafana
  - Prometheus: https://monitoring.bakewise.ai/prometheus
  - Jaeger: https://monitoring.bakewise.ai/jaeger
  - AlertManager: https://monitoring.bakewise.ai/alertmanager

### Production Configuration

The production overlay (`overlays/prod/`) includes:
- **Domain Configuration**: bakewise.ai with Let's Encrypt certificates
- **High Availability**: Multi-replica deployments (2-3 replicas per service)
- **Enhanced Security**: Rate limiting, CORS restrictions, security headers
- **Monitoring**: Full observability stack with Prometheus, Grafana, Jaeger

### Production Considerations

For production deployment:

@@ -263,6 +288,7 @@ For production deployment:
- **External Secrets**: Use managed secret services
- **TLS**: Production Let's Encrypt certificates
- **CI/CD**: Automated deployment pipelines
- **DNS**: Configure DNS A/CNAME records pointing to your cluster's load balancer (see the check below)
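
A quick way to confirm the records resolve before pointing traffic at the cluster; a sketch, assuming `dig` is available and the load-balancer address is already known:

```bash
# Verify DNS points at the cluster's load balancer (returned IPs are environment-specific)
dig +short bakewise.ai
dig +short monitoring.bakewise.ai
curl -sI https://bakewise.ai | head -n 1
```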

## Next Steps

@@ -48,6 +48,9 @@ spec:
                name: pos-integration-secrets
            - secretRef:
                name: whatsapp-secrets
          env:
            - name: OTEL_EXPORTER_OTLP_ENDPOINT
              value: "http://otel-collector.monitoring.svc.cluster.local:4317"
          resources:
            requests:
              memory: "256Mi"
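
To confirm the endpoint is wired through at runtime, a sketch follows; the deployment name is a placeholder for whichever service carries this patch:

```bash
# Check the injected OTLP endpoint inside a running pod (deployment name is a placeholder)
kubectl exec -n bakery-ia deploy/<service-name> -- printenv OTEL_EXPORTER_OTLP_ENDPOINT
```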

@@ -1,429 +0,0 @@
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-alert-rules
  namespace: monitoring
data:
  alert-rules.yml: |
    groups:
      # Basic Infrastructure Alerts
      - name: bakery_services
        interval: 30s
        rules:
          - alert: ServiceDown
            expr: up{job="bakery-services"} == 0
            for: 2m
            labels:
              severity: critical
              component: infrastructure
            annotations:
              summary: "Service {{ $labels.service }} is down"
              description: "Service {{ $labels.service }} in namespace {{ $labels.namespace }} has been down for more than 2 minutes."
              runbook_url: "https://runbooks.bakery-ia.local/ServiceDown"

          - alert: HighErrorRate
            expr: |
              (
                sum(rate(http_requests_total{status_code=~"5..", job="bakery-services"}[5m])) by (service)
                /
                sum(rate(http_requests_total{job="bakery-services"}[5m])) by (service)
              ) > 0.10
            for: 5m
            labels:
              severity: critical
              component: application
            annotations:
              summary: "High error rate on {{ $labels.service }}"
              description: "Service {{ $labels.service }} has error rate above 10% (current: {{ $value | humanizePercentage }})."
              runbook_url: "https://runbooks.bakery-ia.local/HighErrorRate"

          - alert: HighResponseTime
            expr: |
              histogram_quantile(0.95,
                sum(rate(http_request_duration_seconds_bucket{job="bakery-services"}[5m])) by (service, le)
              ) > 1
            for: 5m
            labels:
              severity: warning
              component: performance
            annotations:
              summary: "High response time on {{ $labels.service }}"
              description: "Service {{ $labels.service }} P95 latency is above 1 second (current: {{ $value }}s)."
              runbook_url: "https://runbooks.bakery-ia.local/HighResponseTime"

          - alert: HighMemoryUsage
            expr: |
              container_memory_usage_bytes{namespace="bakery-ia", container!=""} > 500000000
            for: 5m
            labels:
              severity: warning
              component: infrastructure
            annotations:
              summary: "High memory usage in {{ $labels.pod }}"
              description: "Container {{ $labels.container }} in pod {{ $labels.pod }} is using more than 500MB of memory (current: {{ $value | humanize }}B)."
              runbook_url: "https://runbooks.bakery-ia.local/HighMemoryUsage"

          - alert: DatabaseConnectionHigh
            expr: |
              pg_stat_database_numbackends{datname="bakery"} > 80
            for: 5m
            labels:
              severity: warning
              component: database
            annotations:
              summary: "High database connection count"
              description: "Database has more than 80 active connections (current: {{ $value }})."
              runbook_url: "https://runbooks.bakery-ia.local/DatabaseConnectionHigh"

      # Business Logic Alerts
      - name: bakery_business
        interval: 30s
        rules:
          - alert: TrainingJobFailed
            expr: |
              increase(training_job_failures_total[1h]) > 0
            for: 5m
            labels:
              severity: warning
              component: ml-training
            annotations:
              summary: "Training job failures detected"
              description: "{{ $value }} training job(s) failed in the last hour."
              runbook_url: "https://runbooks.bakery-ia.local/TrainingJobFailed"

          - alert: LowPredictionAccuracy
            expr: |
              prediction_model_accuracy < 0.70
            for: 15m
            labels:
              severity: warning
              component: ml-inference
            annotations:
              summary: "Model prediction accuracy is low"
              description: "Model {{ $labels.model_name }} accuracy is below 70% (current: {{ $value | humanizePercentage }})."
              runbook_url: "https://runbooks.bakery-ia.local/LowPredictionAccuracy"

          - alert: APIRateLimitHit
            expr: |
              increase(rate_limit_hits_total[5m]) > 10
            for: 5m
            labels:
              severity: info
              component: api-gateway
            annotations:
              summary: "API rate limits being hit frequently"
              description: "Rate limits hit {{ $value }} times in the last 5 minutes."
              runbook_url: "https://runbooks.bakery-ia.local/APIRateLimitHit"

      # Alert System Health
      - name: alert_system_health
        interval: 30s
        rules:
          - alert: AlertSystemComponentDown
            expr: |
              alert_system_component_health{component=~"processor|notifier|scheduler"} == 0
            for: 2m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "Alert system component {{ $labels.component }} is unhealthy"
              description: "Component {{ $labels.component }} has been unhealthy for more than 2 minutes."
              runbook_url: "https://runbooks.bakery-ia.local/AlertSystemComponentDown"

          - alert: RabbitMQConnectionDown
            expr: |
              rabbitmq_up == 0
            for: 1m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "RabbitMQ connection is down"
              description: "Alert system has lost connection to RabbitMQ message queue."
              runbook_url: "https://runbooks.bakery-ia.local/RabbitMQConnectionDown"

          - alert: RedisConnectionDown
            expr: |
              redis_up == 0
            for: 1m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "Redis connection is down"
              description: "Alert system has lost connection to Redis cache."
              runbook_url: "https://runbooks.bakery-ia.local/RedisConnectionDown"

          - alert: NoSchedulerLeader
            expr: |
              sum(alert_system_scheduler_leader) == 0
            for: 5m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "No alert scheduler leader elected"
              description: "No scheduler instance has been elected as leader for 5 minutes."
              runbook_url: "https://runbooks.bakery-ia.local/NoSchedulerLeader"

      # Alert System Performance
      - name: alert_system_performance
        interval: 30s
        rules:
          - alert: HighAlertProcessingErrorRate
            expr: |
              (
                sum(rate(alert_processing_errors_total[2m]))
                /
                sum(rate(alerts_processed_total[2m]))
              ) > 0.10
            for: 2m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "High alert processing error rate"
              description: "Alert processing error rate is above 10% (current: {{ $value | humanizePercentage }})."
              runbook_url: "https://runbooks.bakery-ia.local/HighAlertProcessingErrorRate"

          - alert: HighNotificationDeliveryFailureRate
            expr: |
              (
                sum(rate(notification_delivery_failures_total[3m]))
                /
                sum(rate(notifications_sent_total[3m]))
              ) > 0.05
            for: 3m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "High notification delivery failure rate"
              description: "Notification delivery failure rate is above 5% (current: {{ $value | humanizePercentage }})."
              runbook_url: "https://runbooks.bakery-ia.local/HighNotificationDeliveryFailureRate"

          - alert: HighAlertProcessingLatency
            expr: |
              histogram_quantile(0.95,
                sum(rate(alert_processing_duration_seconds_bucket[5m])) by (le)
              ) > 5
            for: 5m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "High alert processing latency"
              description: "P95 alert processing latency is above 5 seconds (current: {{ $value }}s)."
              runbook_url: "https://runbooks.bakery-ia.local/HighAlertProcessingLatency"

          - alert: TooManySSEConnections
            expr: |
              sse_active_connections > 1000
            for: 2m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "Too many active SSE connections"
              description: "More than 1000 active SSE connections (current: {{ $value }})."
              runbook_url: "https://runbooks.bakery-ia.local/TooManySSEConnections"

          - alert: SSEConnectionErrors
            expr: |
              rate(sse_connection_errors_total[3m]) > 0.5
            for: 3m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "High rate of SSE connection errors"
              description: "SSE connection error rate is {{ $value }} errors/sec."
              runbook_url: "https://runbooks.bakery-ia.local/SSEConnectionErrors"

      # Alert System Business Logic
      - name: alert_system_business
        interval: 30s
        rules:
          - alert: UnusuallyHighAlertVolume
            expr: |
              rate(alerts_generated_total[5m]) > 2
            for: 5m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "Unusually high alert generation volume"
              description: "More than 2 alerts per second being generated (current: {{ $value }}/sec)."
              runbook_url: "https://runbooks.bakery-ia.local/UnusuallyHighAlertVolume"

          - alert: NoAlertsGenerated
            expr: |
              rate(alerts_generated_total[30m]) == 0
            for: 15m
            labels:
              severity: info
              component: alert-system
            annotations:
              summary: "No alerts generated recently"
              description: "No alerts have been generated in the last 30 minutes. This might indicate a problem with alert detection."
              runbook_url: "https://runbooks.bakery-ia.local/NoAlertsGenerated"

          - alert: SlowAlertResponseTime
            expr: |
              histogram_quantile(0.95,
                sum(rate(alert_response_time_seconds_bucket[10m])) by (le)
              ) > 3600
            for: 10m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "Slow alert response times"
              description: "P95 alert response time is above 1 hour (current: {{ $value | humanizeDuration }})."
              runbook_url: "https://runbooks.bakery-ia.local/SlowAlertResponseTime"

          - alert: CriticalAlertsUnacknowledged
            expr: |
              sum(alerts_unacknowledged{severity="critical"}) > 5
            for: 10m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "Multiple critical alerts unacknowledged"
              description: "{{ $value }} critical alerts have not been acknowledged for 10+ minutes."
              runbook_url: "https://runbooks.bakery-ia.local/CriticalAlertsUnacknowledged"

      # Alert System Capacity
      - name: alert_system_capacity
        interval: 30s
        rules:
          - alert: LargeSSEMessageQueues
            expr: |
              sse_message_queue_size > 100
            for: 5m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "Large SSE message queues detected"
              description: "SSE message queue for tenant {{ $labels.tenant_id }} has {{ $value }} messages queued."
              runbook_url: "https://runbooks.bakery-ia.local/LargeSSEMessageQueues"

          - alert: SlowDatabaseStorage
            expr: |
              histogram_quantile(0.95,
                sum(rate(alert_storage_duration_seconds_bucket[5m])) by (le)
              ) > 1
            for: 5m
            labels:
              severity: warning
              component: alert-system
            annotations:
              summary: "Slow alert database storage"
              description: "P95 alert storage latency is above 1 second (current: {{ $value }}s)."
              runbook_url: "https://runbooks.bakery-ia.local/SlowDatabaseStorage"

      # Alert System Critical Scenarios
      - name: alert_system_critical
        interval: 15s
        rules:
          - alert: AlertSystemDown
            expr: |
              up{service=~"alert-processor|notification-service"} == 0
            for: 1m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "Alert system is completely down"
              description: "Core alert system service {{ $labels.service }} is down."
              runbook_url: "https://runbooks.bakery-ia.local/AlertSystemDown"

          - alert: AlertDataNotPersisted
            expr: |
              (
                sum(rate(alerts_processed_total[2m]))
                -
                sum(rate(alerts_stored_total[2m]))
              ) > 0
            for: 2m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "Alerts not being persisted to database"
              description: "Alerts are being processed but not stored in the database."
              runbook_url: "https://runbooks.bakery-ia.local/AlertDataNotPersisted"

          - alert: NotificationsNotDelivered
            expr: |
              (
                sum(rate(alerts_processed_total[3m]))
                -
                sum(rate(notifications_sent_total[3m]))
              ) > 0
            for: 3m
            labels:
              severity: critical
              component: alert-system
            annotations:
              summary: "Notifications not being delivered"
              description: "Alerts are being processed but notifications are not being sent."
              runbook_url: "https://runbooks.bakery-ia.local/NotificationsNotDelivered"

      # Monitoring System Self-Monitoring
      - name: monitoring_health
        interval: 30s
        rules:
          - alert: PrometheusDown
            expr: up{job="prometheus"} == 0
            for: 5m
            labels:
              severity: critical
              component: monitoring
            annotations:
              summary: "Prometheus is down"
              description: "Prometheus monitoring system is not responding."
              runbook_url: "https://runbooks.bakery-ia.local/PrometheusDown"

          - alert: AlertManagerDown
            expr: up{job="alertmanager"} == 0
            for: 2m
            labels:
              severity: critical
              component: monitoring
            annotations:
              summary: "AlertManager is down"
              description: "AlertManager is not responding. Alerts will not be routed."
              runbook_url: "https://runbooks.bakery-ia.local/AlertManagerDown"

          - alert: PrometheusStorageFull
            expr: |
              (
                prometheus_tsdb_storage_blocks_bytes
                /
                (prometheus_tsdb_storage_blocks_bytes + prometheus_tsdb_wal_size_bytes)
              ) > 0.90
            for: 10m
            labels:
              severity: warning
              component: monitoring
            annotations:
              summary: "Prometheus storage almost full"
              description: "Prometheus storage is {{ $value | humanizePercentage }} full."
              runbook_url: "https://runbooks.bakery-ia.local/PrometheusStorageFull"

          - alert: PrometheusScrapeErrors
            expr: |
              rate(prometheus_target_scrapes_exceeded_sample_limit_total[5m]) > 0
            for: 5m
            labels:
              severity: warning
              component: monitoring
            annotations:
              summary: "Prometheus scrape errors detected"
              description: "Prometheus is experiencing scrape errors for target {{ $labels.job }}."
              runbook_url: "https://runbooks.bakery-ia.local/PrometheusScrapeErrors"

@@ -1,27 +0,0 @@
---
# InitContainer to substitute secrets into AlertManager config
# This allows us to use environment variables from secrets in the config file
apiVersion: v1
kind: ConfigMap
metadata:
  name: alertmanager-init-script
  namespace: monitoring
data:
  init-config.sh: |
    #!/bin/sh
    set -e

    # Read the template config
    TEMPLATE=$(cat /etc/alertmanager-template/alertmanager.yml)

    # Substitute environment variables
    echo "$TEMPLATE" | \
      sed "s|{{ .smtp_host }}|${SMTP_HOST}|g" | \
      sed "s|{{ .smtp_from }}|${SMTP_FROM}|g" | \
      sed "s|{{ .smtp_username }}|${SMTP_USERNAME}|g" | \
      sed "s|{{ .smtp_password }}|${SMTP_PASSWORD}|g" | \
      sed "s|{{ .slack_webhook_url }}|${SLACK_WEBHOOK_URL}|g" \
      > /etc/alertmanager-final/alertmanager.yml

    echo "AlertManager config initialized successfully"
    cat /etc/alertmanager-final/alertmanager.yml
@@ -1,391 +0,0 @@
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: alertmanager-config
|
||||
namespace: monitoring
|
||||
data:
|
||||
alertmanager.yml: |
|
||||
global:
|
||||
resolve_timeout: 5m
|
||||
smtp_smarthost: '{{ .smtp_host }}'
|
||||
smtp_from: '{{ .smtp_from }}'
|
||||
smtp_auth_username: '{{ .smtp_username }}'
|
||||
smtp_auth_password: '{{ .smtp_password }}'
|
||||
smtp_require_tls: true
|
||||
|
||||
# Define notification templates
|
||||
templates:
|
||||
- '/etc/alertmanager/templates/*.tmpl'
|
||||
|
||||
# Route alerts to appropriate receivers
|
||||
route:
|
||||
# Default receiver
|
||||
receiver: 'default-email'
|
||||
# Group alerts by these labels
|
||||
group_by: ['alertname', 'cluster', 'service']
|
||||
# Wait time before sending initial notification
|
||||
group_wait: 10s
|
||||
# Wait time before sending notifications about new alerts in the group
|
||||
group_interval: 10s
|
||||
# Wait time before re-sending a notification
|
||||
repeat_interval: 12h
|
||||
|
||||
# Child routes for specific alert routing
|
||||
routes:
|
||||
# Critical alerts - send immediately to all channels
|
||||
- match:
|
||||
severity: critical
|
||||
receiver: 'critical-alerts'
|
||||
group_wait: 0s
|
||||
group_interval: 5m
|
||||
repeat_interval: 4h
|
||||
continue: true
|
||||
|
||||
# Warning alerts - less urgent
|
||||
- match:
|
||||
severity: warning
|
||||
receiver: 'warning-alerts'
|
||||
group_wait: 30s
|
||||
group_interval: 5m
|
||||
repeat_interval: 12h
|
||||
|
||||
# Alert system specific alerts
|
||||
- match:
|
||||
component: alert-system
|
||||
receiver: 'alert-system-team'
|
||||
group_wait: 10s
|
||||
repeat_interval: 6h
|
||||
|
||||
# Database alerts
|
||||
- match_re:
|
||||
alertname: ^(DatabaseConnectionHigh|SlowDatabaseStorage)$
|
||||
receiver: 'database-team'
|
||||
group_wait: 30s
|
||||
repeat_interval: 8h
|
||||
|
||||
# Infrastructure alerts
|
||||
- match_re:
|
||||
alertname: ^(HighMemoryUsage|ServiceDown)$
|
||||
receiver: 'infra-team'
|
||||
group_wait: 30s
|
||||
repeat_interval: 6h
|
||||
|
||||
# Inhibition rules - prevent alert spam
|
||||
inhibit_rules:
|
||||
# If service is down, inhibit all other alerts for that service
|
||||
- source_match:
|
||||
alertname: 'ServiceDown'
|
||||
target_match_re:
|
||||
alertname: '(HighErrorRate|HighResponseTime|HighMemoryUsage)'
|
||||
equal: ['service']
|
||||
|
||||
# If AlertSystem is completely down, inhibit component alerts
|
||||
- source_match:
|
||||
alertname: 'AlertSystemDown'
|
||||
target_match_re:
|
||||
alertname: 'AlertSystemComponent.*'
|
||||
equal: ['namespace']
|
||||
|
||||
# If RabbitMQ is down, inhibit alert processing errors
|
||||
- source_match:
|
||||
alertname: 'RabbitMQConnectionDown'
|
||||
target_match:
|
||||
alertname: 'HighAlertProcessingErrorRate'
|
||||
equal: ['namespace']
|
||||
|
||||
# Receivers - notification destinations
|
||||
receivers:
|
||||
# Default email receiver
|
||||
- name: 'default-email'
|
||||
email_configs:
|
||||
- to: 'alerts@yourdomain.com'
|
||||
headers:
|
||||
Subject: '[{{ .Status | toUpper }}] {{ .GroupLabels.alertname }} - {{ .GroupLabels.service }}'
|
||||
html: |
|
||||
{{ range .Alerts }}
|
||||
<h2>{{ .Labels.alertname }}</h2>
|
||||
<p><strong>Status:</strong> {{ .Status }}</p>
|
||||
<p><strong>Severity:</strong> {{ .Labels.severity }}</p>
|
||||
<p><strong>Service:</strong> {{ .Labels.service }}</p>
|
||||
<p><strong>Summary:</strong> {{ .Annotations.summary }}</p>
|
||||
<p><strong>Description:</strong> {{ .Annotations.description }}</p>
|
||||
<p><strong>Started:</strong> {{ .StartsAt }}</p>
|
||||
{{ if .EndsAt }}<p><strong>Ended:</strong> {{ .EndsAt }}</p>{{ end }}
|
||||
{{ end }}
|
||||
|
||||
# Critical alerts - multiple channels
|
||||
- name: 'critical-alerts'
|
||||
email_configs:
|
||||
- to: 'critical-alerts@yourdomain.com,oncall@yourdomain.com'
|
||||
headers:
|
||||
Subject: '🚨 [CRITICAL] {{ .GroupLabels.alertname }} - {{ .GroupLabels.service }}'
|
||||
send_resolved: true
|
||||
# Uncomment to enable Slack notifications
|
||||
# slack_configs:
|
||||
# - api_url: '{{ .slack_webhook_url }}'
|
||||
# channel: '#alerts-critical'
|
||||
# title: '🚨 Critical Alert'
|
||||
# text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
|
||||
# send_resolved: true
|
||||
|
||||
# Warning alerts
|
||||
- name: 'warning-alerts'
|
||||
email_configs:
|
||||
- to: 'alerts@yourdomain.com'
|
||||
headers:
|
||||
Subject: '⚠️ [WARNING] {{ .GroupLabels.alertname }} - {{ .GroupLabels.service }}'
|
||||
send_resolved: true
|
||||
|
||||
# Alert system team
|
||||
- name: 'alert-system-team'
|
||||
email_configs:
|
||||
- to: 'alert-system-team@yourdomain.com'
|
||||
headers:
|
||||
Subject: '[Alert System] {{ .GroupLabels.alertname }}'
|
||||
send_resolved: true
|
||||
|
||||
# Database team
|
||||
- name: 'database-team'
|
||||
email_configs:
|
||||
- to: 'database-team@yourdomain.com'
|
||||
headers:
|
||||
Subject: '[Database] {{ .GroupLabels.alertname }}'
|
||||
send_resolved: true
|
||||
|
||||
# Infrastructure team
|
||||
- name: 'infra-team'
|
||||
email_configs:
|
||||
- to: 'infra-team@yourdomain.com'
|
||||
headers:
|
||||
Subject: '[Infrastructure] {{ .GroupLabels.alertname }}'
|
||||
send_resolved: true
|
||||
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: alertmanager-templates
|
||||
namespace: monitoring
|
||||
data:
|
||||
default.tmpl: |
|
||||
{{ define "cluster" }}{{ .ExternalURL | reReplaceAll ".*alertmanager\\.(.*)" "$1" }}{{ end }}
|
||||
|
||||
{{ define "slack.default.title" }}
|
||||
[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.alertname }}
|
||||
{{ end }}
|
||||
|
||||
{{ define "slack.default.text" }}
|
||||
{{ range .Alerts }}
|
||||
*Alert:* {{ .Annotations.summary }}
|
||||
*Description:* {{ .Annotations.description }}
|
||||
*Severity:* `{{ .Labels.severity }}`
|
||||
*Service:* `{{ .Labels.service }}`
|
||||
{{ end }}
|
||||
{{ end }}
|
||||
|
||||
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: alertmanager
  namespace: monitoring
  labels:
    app: alertmanager
spec:
  serviceName: alertmanager
  replicas: 3
  selector:
    matchLabels:
      app: alertmanager
  template:
    metadata:
      labels:
        app: alertmanager
    spec:
      serviceAccountName: prometheus
      initContainers:
        - name: init-config
          image: busybox:1.36
          command: ['/bin/sh', '/scripts/init-config.sh']
          env:
            - name: SMTP_HOST
              valueFrom:
                secretKeyRef:
                  name: alertmanager-secrets
                  key: smtp-host
            - name: SMTP_USERNAME
              valueFrom:
                secretKeyRef:
                  name: alertmanager-secrets
                  key: smtp-username
            - name: SMTP_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: alertmanager-secrets
                  key: smtp-password
            - name: SMTP_FROM
              valueFrom:
                secretKeyRef:
                  name: alertmanager-secrets
                  key: smtp-from
            - name: SLACK_WEBHOOK_URL
              valueFrom:
                secretKeyRef:
                  name: alertmanager-secrets
                  key: slack-webhook-url
                  optional: true
          volumeMounts:
            - name: init-script
              mountPath: /scripts
            - name: config-template
              mountPath: /etc/alertmanager-template
            - name: config-final
              mountPath: /etc/alertmanager-final
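      # The init script comes from the alertmanager-init-script ConfigMap
      # (defined in alertmanager-init.yaml, which is not shown in this diff).
      # A minimal sketch of what it presumably does — substitute the env vars
      # above into the config template; the actual script may differ:
      #
      #   #!/bin/sh
      #   cp /etc/alertmanager-template/alertmanager.yml /etc/alertmanager-final/alertmanager.yml
      #   for v in SMTP_HOST SMTP_USERNAME SMTP_PASSWORD SMTP_FROM SLACK_WEBHOOK_URL; do
      #     eval val=\"\$$v\"
      #     sed -i "s|\${$v}|$val|g" /etc/alertmanager-final/alertmanager.yml
      #   done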
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values:
                        - alertmanager
                topologyKey: kubernetes.io/hostname
      containers:
        - name: alertmanager
          image: prom/alertmanager:v0.27.0
          args:
            - '--config.file=/etc/alertmanager/alertmanager.yml'
            - '--storage.path=/alertmanager'
            - '--cluster.listen-address=0.0.0.0:9094'
            - '--cluster.peer=alertmanager-0.alertmanager.monitoring.svc.cluster.local:9094'
            - '--cluster.peer=alertmanager-1.alertmanager.monitoring.svc.cluster.local:9094'
            - '--cluster.peer=alertmanager-2.alertmanager.monitoring.svc.cluster.local:9094'
            - '--cluster.reconnect-timeout=5m'
            - '--web.external-url=http://monitoring.bakery-ia.local/alertmanager'
            - '--web.route-prefix=/'
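          # The per-pod peer addresses above resolve through the headless
          # Service (clusterIP: None) named "alertmanager" defined further
          # down; that Service is what gives each StatefulSet pod its stable
          # <pod>.<service>.<namespace>.svc.cluster.local DNS name.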
          ports:
            - name: web
              containerPort: 9093
            - name: mesh-tcp
              containerPort: 9094
            - name: mesh-udp
              containerPort: 9094
              protocol: UDP
          env:
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
          volumeMounts:
            - name: config-final
              mountPath: /etc/alertmanager
            - name: templates
              mountPath: /etc/alertmanager/templates
            - name: storage
              mountPath: /alertmanager
          resources:
            requests:
              memory: "128Mi"
              cpu: "100m"
            limits:
              memory: "256Mi"
              cpu: "500m"
          livenessProbe:
            httpGet:
              path: /-/healthy
              port: 9093
            initialDelaySeconds: 30
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /-/ready
              port: 9093
            initialDelaySeconds: 5
            periodSeconds: 5

        # Config reloader sidecar
        - name: configmap-reload
          image: jimmidyson/configmap-reload:v0.12.0
          args:
            - '--webhook-url=http://localhost:9093/-/reload'
            - '--volume-dir=/etc/alertmanager'
          volumeMounts:
            - name: config-final
              mountPath: /etc/alertmanager
              readOnly: true
          resources:
            requests:
              memory: "16Mi"
              cpu: "10m"
            limits:
              memory: "32Mi"
              cpu: "50m"

      volumes:
        - name: init-script
          configMap:
            name: alertmanager-init-script
            defaultMode: 0755
        - name: config-template
          configMap:
            name: alertmanager-config
        - name: config-final
          emptyDir: {}
        - name: templates
          configMap:
            name: alertmanager-templates

  volumeClaimTemplates:
    - metadata:
        name: storage
      spec:
        accessModes: [ "ReadWriteOnce" ]
        resources:
          requests:
            storage: 2Gi

---
apiVersion: v1
kind: Service
metadata:
  name: alertmanager
  namespace: monitoring
  labels:
    app: alertmanager
spec:
  type: ClusterIP
  clusterIP: None
  ports:
    - name: web
      port: 9093
      targetPort: 9093
    - name: mesh-tcp
      port: 9094
      targetPort: 9094
    - name: mesh-udp
      port: 9094
      targetPort: 9094
      protocol: UDP
  selector:
    app: alertmanager

---
apiVersion: v1
kind: Service
metadata:
  name: alertmanager-external
  namespace: monitoring
  labels:
    app: alertmanager
spec:
  type: ClusterIP
  ports:
    - name: web
      port: 9093
      targetPort: 9093
  selector:
    app: alertmanager

@@ -1,949 +0,0 @@
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboards-extended
  namespace: monitoring
data:
  postgresql-dashboard.json: |
    {
      "dashboard": {
        "title": "Bakery IA - PostgreSQL Database",
        "tags": ["bakery-ia", "postgresql", "database"],
        "timezone": "browser",
        "refresh": "30s",
        "schemaVersion": 16,
        "version": 1,
        "panels": [
          {
            "id": 1,
            "title": "Active Connections by Database",
            "type": "graph",
            "gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "pg_stat_activity_count{state=\"active\"}",
                "legendFormat": "{{datname}} - active"
              },
              {
                "expr": "pg_stat_activity_count{state=\"idle\"}",
                "legendFormat": "{{datname}} - idle"
              },
              {
                "expr": "pg_stat_activity_count{state=\"idle in transaction\"}",
                "legendFormat": "{{datname}} - idle tx"
              }
            ]
          },
          {
            "id": 2,
            "title": "Total Connections",
            "type": "stat",
            "gridPos": {"x": 12, "y": 0, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "sum(pg_stat_activity_count)",
                "legendFormat": "Total connections"
              }
            ]
          },
          {
            "id": 3,
            "title": "Max Connections",
            "type": "stat",
            "gridPos": {"x": 18, "y": 0, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "pg_settings_max_connections",
                "legendFormat": "Max connections"
              }
            ]
          },
          {
            "id": 4,
            "title": "Transaction Rate (Commits vs Rollbacks)",
            "type": "graph",
            "gridPos": {"x": 0, "y": 8, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "rate(pg_stat_database_xact_commit[5m])",
                "legendFormat": "{{datname}} - commits"
              },
              {
                "expr": "rate(pg_stat_database_xact_rollback[5m])",
                "legendFormat": "{{datname}} - rollbacks"
              }
            ]
          },
          {
            "id": 5,
            "title": "Cache Hit Ratio",
            "type": "graph",
            "gridPos": {"x": 12, "y": 8, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "100 * (1 - (sum(rate(pg_stat_io_blocks_read_total[5m])) / (sum(rate(pg_stat_io_blocks_read_total[5m])) + sum(rate(pg_stat_io_blocks_hit_total[5m])))))",
                "legendFormat": "Cache hit ratio %"
              }
            ]
          },
          {
            "id": 6,
            "title": "Slow Queries (> 30s)",
            "type": "table",
            "gridPos": {"x": 0, "y": 16, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "pg_slow_queries > 30000",
                "format": "table",
                "instant": true
              }
            ],
            "transformations": [
              {
                "id": "organize",
                "options": {
                  "excludeByName": {},
                  "indexByName": {},
                  "renameByName": {
                    "query": "Query",
                    "duration_ms": "Duration (ms)",
                    "datname": "Database"
                  }
                }
              }
            ]
          },
          {
            "id": 7,
            "title": "Dead Tuples by Table",
            "type": "graph",
            "gridPos": {"x": 12, "y": 16, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "pg_stat_user_tables_n_dead_tup",
                "legendFormat": "{{schemaname}}.{{relname}}"
              }
            ]
          },
          {
            "id": 8,
            "title": "Table Bloat Estimate",
            "type": "graph",
            "gridPos": {"x": 0, "y": 24, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "100 * (pg_stat_user_tables_n_dead_tup * avg_tuple_size) / (pg_total_relation_size * 8192)",
                "legendFormat": "{{schemaname}}.{{relname}} bloat %"
              }
            ]
          },
          {
            "id": 9,
            "title": "Replication Lag (bytes)",
            "type": "graph",
            "gridPos": {"x": 12, "y": 24, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "pg_replication_lag_bytes",
                "legendFormat": "{{slot_name}} - {{application_name}}"
              }
            ]
          },
          {
            "id": 10,
            "title": "Database Size (GB)",
            "type": "graph",
            "gridPos": {"x": 0, "y": 32, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "pg_database_size_bytes / 1024 / 1024 / 1024",
                "legendFormat": "{{datname}}"
              }
            ]
          },
          {
            "id": 11,
            "title": "Database Size Growth (per hour)",
            "type": "graph",
            "gridPos": {"x": 12, "y": 32, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "rate(pg_database_size_bytes[1h])",
                "legendFormat": "{{datname}} - bytes/hour"
              }
            ]
          },
          {
            "id": 12,
            "title": "Lock Counts by Type",
            "type": "graph",
            "gridPos": {"x": 0, "y": 40, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "pg_locks_count",
                "legendFormat": "{{datname}} - {{locktype}} - {{mode}}"
              }
            ]
          },
          {
            "id": 13,
            "title": "Query Duration (p95)",
            "type": "graph",
            "gridPos": {"x": 12, "y": 40, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "histogram_quantile(0.95, rate(pg_query_duration_seconds_bucket[5m]))",
                "legendFormat": "p95"
              }
            ]
          }
        ]
      }
    }

  node-exporter-dashboard.json: |
    {
      "dashboard": {
        "title": "Bakery IA - Node Exporter Infrastructure",
        "tags": ["bakery-ia", "node-exporter", "infrastructure"],
        "timezone": "browser",
        "refresh": "15s",
        "schemaVersion": 16,
        "version": 1,
        "panels": [
          {
            "id": 1,
            "title": "CPU Usage by Node",
            "type": "graph",
            "gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "100 - (avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
                "legendFormat": "{{instance}}"
              }
            ]
          },
          {
            "id": 2,
            "title": "Average CPU Usage",
            "type": "stat",
            "gridPos": {"x": 12, "y": 0, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
                "legendFormat": "Average CPU %"
              }
            ]
          },
          {
            "id": 3,
            "title": "CPU Load (1m, 5m, 15m)",
            "type": "stat",
            "gridPos": {"x": 18, "y": 0, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "avg(node_load1)",
                "legendFormat": "1m"
              },
              {
                "expr": "avg(node_load5)",
                "legendFormat": "5m"
              },
              {
                "expr": "avg(node_load15)",
                "legendFormat": "15m"
              }
            ]
          },
          {
            "id": 4,
            "title": "Memory Usage by Node",
            "type": "graph",
            "gridPos": {"x": 0, "y": 8, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
                "legendFormat": "{{instance}}"
              }
            ]
          },
          {
            "id": 5,
            "title": "Memory Used (GB)",
            "type": "stat",
            "gridPos": {"x": 12, "y": 8, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / 1024 / 1024 / 1024",
                "legendFormat": "{{instance}}"
              }
            ]
          },
          {
            "id": 6,
            "title": "Memory Available (GB)",
            "type": "stat",
            "gridPos": {"x": 18, "y": 8, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "node_memory_MemAvailable_bytes / 1024 / 1024 / 1024",
                "legendFormat": "{{instance}}"
              }
            ]
          },
          {
            "id": 7,
            "title": "Disk I/O Read Rate (MB/s)",
            "type": "graph",
            "gridPos": {"x": 0, "y": 16, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "rate(node_disk_read_bytes_total[5m]) / 1024 / 1024",
                "legendFormat": "{{instance}} - {{device}}"
              }
            ]
          },
          {
            "id": 8,
            "title": "Disk I/O Write Rate (MB/s)",
            "type": "graph",
            "gridPos": {"x": 12, "y": 16, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "rate(node_disk_written_bytes_total[5m]) / 1024 / 1024",
                "legendFormat": "{{instance}} - {{device}}"
              }
            ]
          },
          {
            "id": 9,
            "title": "Disk I/O Operations (IOPS)",
            "type": "graph",
            "gridPos": {"x": 0, "y": 24, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "rate(node_disk_reads_completed_total[5m]) + rate(node_disk_writes_completed_total[5m])",
                "legendFormat": "{{instance}} - {{device}}"
              }
            ]
          },
          {
            "id": 10,
            "title": "Network Receive Rate (Mbps)",
            "type": "graph",
            "gridPos": {"x": 12, "y": 24, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "rate(node_network_receive_bytes_total{device!=\"lo\"}[5m]) * 8 / 1024 / 1024",
                "legendFormat": "{{instance}} - {{device}}"
              }
            ]
          },
          {
            "id": 11,
            "title": "Network Transmit Rate (Mbps)",
            "type": "graph",
            "gridPos": {"x": 0, "y": 32, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "rate(node_network_transmit_bytes_total{device!=\"lo\"}[5m]) * 8 / 1024 / 1024",
                "legendFormat": "{{instance}} - {{device}}"
              }
            ]
          },
          {
            "id": 12,
            "title": "Network Errors",
            "type": "graph",
            "gridPos": {"x": 12, "y": 32, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "rate(node_network_receive_errs_total[5m]) + rate(node_network_transmit_errs_total[5m])",
                "legendFormat": "{{instance}} - {{device}}"
              }
            ]
          },
          {
            "id": 13,
            "title": "Filesystem Usage by Mount",
            "type": "graph",
            "gridPos": {"x": 0, "y": 40, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "100 * (1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes))",
                "legendFormat": "{{instance}} - {{mountpoint}}"
              }
            ]
          },
          {
            "id": 14,
            "title": "Filesystem Available (GB)",
            "type": "stat",
            "gridPos": {"x": 12, "y": 40, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "node_filesystem_avail_bytes / 1024 / 1024 / 1024",
                "legendFormat": "{{instance}} - {{mountpoint}}"
              }
            ]
          },
          {
            "id": 15,
            "title": "Filesystem Size (GB)",
            "type": "stat",
            "gridPos": {"x": 18, "y": 40, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "node_filesystem_size_bytes / 1024 / 1024 / 1024",
                "legendFormat": "{{instance}} - {{mountpoint}}"
              }
            ]
          },
          {
            "id": 16,
            "title": "Load Average (1m, 5m, 15m)",
            "type": "graph",
            "gridPos": {"x": 0, "y": 48, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "node_load1",
                "legendFormat": "{{instance}} - 1m"
              },
              {
                "expr": "node_load5",
                "legendFormat": "{{instance}} - 5m"
              },
              {
                "expr": "node_load15",
                "legendFormat": "{{instance}} - 15m"
              }
            ]
          },
          {
            "id": 17,
            "title": "System Up Time",
            "type": "stat",
            "gridPos": {"x": 12, "y": 48, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "time() - node_boot_time_seconds",
                "legendFormat": "{{instance}} - uptime"
              }
            ]
          },
          {
            "id": 18,
            "title": "Context Switches",
            "type": "graph",
            "gridPos": {"x": 0, "y": 56, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "rate(node_context_switches_total[5m])",
                "legendFormat": "{{instance}}"
              }
            ]
          },
          {
            "id": 19,
            "title": "Interrupts",
            "type": "graph",
            "gridPos": {"x": 12, "y": 56, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "rate(node_intr_total[5m])",
                "legendFormat": "{{instance}}"
              }
            ]
          }
        ]
      }
    }

  alertmanager-dashboard.json: |
    {
      "dashboard": {
        "title": "Bakery IA - AlertManager Monitoring",
        "tags": ["bakery-ia", "alertmanager", "alerting"],
        "timezone": "browser",
        "refresh": "10s",
        "schemaVersion": 16,
        "version": 1,
        "panels": [
          {
            "id": 1,
            "title": "Active Alerts by Severity",
            "type": "graph",
            "gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "count by (severity) (ALERTS{alertstate=\"firing\"})",
                "legendFormat": "{{severity}}"
              }
            ]
          },
          {
            "id": 2,
            "title": "Total Active Alerts",
            "type": "stat",
            "gridPos": {"x": 12, "y": 0, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "count(ALERTS{alertstate=\"firing\"})",
                "legendFormat": "Active alerts"
              }
            ]
          },
          {
            "id": 3,
            "title": "Critical Alerts",
            "type": "stat",
            "gridPos": {"x": 18, "y": 0, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "count(ALERTS{alertstate=\"firing\", severity=\"critical\"})",
                "legendFormat": "Critical"
              }
            ]
          },
          {
            "id": 4,
            "title": "Alert Firing Rate (per minute)",
            "type": "graph",
            "gridPos": {"x": 0, "y": 8, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "rate(alertmanager_alerts_fired_total[1m])",
                "legendFormat": "Alerts fired/min"
              }
            ]
          },
          {
            "id": 5,
            "title": "Alert Resolution Rate (per minute)",
            "type": "graph",
            "gridPos": {"x": 12, "y": 8, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "rate(alertmanager_alerts_resolved_total[1m])",
                "legendFormat": "Alerts resolved/min"
              }
            ]
          },
          {
            "id": 6,
            "title": "Notification Success Rate",
            "type": "graph",
            "gridPos": {"x": 0, "y": 16, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "100 * (rate(alertmanager_notifications_total{status=\"success\"}[5m]) / rate(alertmanager_notifications_total[5m]))",
                "legendFormat": "Success rate %"
              }
            ]
          },
          {
            "id": 7,
            "title": "Notification Failures",
            "type": "graph",
            "gridPos": {"x": 12, "y": 16, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "rate(alertmanager_notifications_total{status=\"failed\"}[5m])",
                "legendFormat": "{{integration}}"
              }
            ]
          },
          {
            "id": 8,
            "title": "Silenced Alerts",
            "type": "stat",
            "gridPos": {"x": 0, "y": 24, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "count(ALERTS{alertstate=\"silenced\"})",
                "legendFormat": "Silenced"
              }
            ]
          },
          {
            "id": 9,
            "title": "AlertManager Cluster Size",
            "type": "stat",
            "gridPos": {"x": 6, "y": 24, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "count(alertmanager_cluster_peers)",
                "legendFormat": "Cluster peers"
              }
            ]
          },
          {
            "id": 10,
            "title": "AlertManager Peers",
            "type": "stat",
            "gridPos": {"x": 12, "y": 24, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "alertmanager_cluster_peers",
                "legendFormat": "{{instance}}"
              }
            ]
          },
          {
            "id": 11,
            "title": "Cluster Status",
            "type": "stat",
            "gridPos": {"x": 18, "y": 24, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "up{job=\"alertmanager\"}",
                "legendFormat": "{{instance}}"
              }
            ]
          },
          {
            "id": 12,
            "title": "Alerts by Group",
            "type": "table",
            "gridPos": {"x": 0, "y": 28, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "count by (alertname) (ALERTS{alertstate=\"firing\"})",
                "format": "table",
                "instant": true
              }
            ],
            "transformations": [
              {
                "id": "organize",
                "options": {
                  "excludeByName": {},
                  "indexByName": {},
                  "renameByName": {
                    "alertname": "Alert Name",
                    "Value": "Count"
                  }
                }
              }
            ]
          },
          {
            "id": 13,
            "title": "Alert Duration (p99)",
            "type": "graph",
            "gridPos": {"x": 12, "y": 28, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "histogram_quantile(0.99, rate(alertmanager_alert_duration_seconds_bucket[5m]))",
                "legendFormat": "p99 duration"
              }
            ]
          },
          {
            "id": 14,
            "title": "Processing Time",
            "type": "graph",
            "gridPos": {"x": 0, "y": 36, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "rate(alertmanager_receiver_processing_duration_seconds_sum[5m]) / rate(alertmanager_receiver_processing_duration_seconds_count[5m])",
                "legendFormat": "{{receiver}}"
              }
            ]
          },
          {
            "id": 15,
            "title": "Memory Usage",
            "type": "stat",
            "gridPos": {"x": 12, "y": 36, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "process_resident_memory_bytes{job=\"alertmanager\"} / 1024 / 1024",
                "legendFormat": "{{instance}} - MB"
              }
            ]
          }
        ]
      }
    }

  business-metrics-dashboard.json: |
    {
      "dashboard": {
        "title": "Bakery IA - Business Metrics & KPIs",
        "tags": ["bakery-ia", "business-metrics", "kpis"],
        "timezone": "browser",
        "refresh": "30s",
        "schemaVersion": 16,
        "version": 1,
        "panels": [
          {
            "id": 1,
            "title": "Requests per Service (Rate)",
            "type": "graph",
            "gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "sum by (service) (rate(http_requests_total[5m]))",
                "legendFormat": "{{service}}"
              }
            ]
          },
          {
            "id": 2,
            "title": "Total Request Rate",
            "type": "stat",
            "gridPos": {"x": 12, "y": 0, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "sum(rate(http_requests_total[5m]))",
                "legendFormat": "requests/sec"
              }
            ]
          },
          {
            "id": 3,
            "title": "Peak Request Rate (5m)",
            "type": "stat",
            "gridPos": {"x": 18, "y": 0, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "max(sum(rate(http_requests_total[5m])))",
                "legendFormat": "Peak requests/sec"
              }
            ]
          },
          {
            "id": 4,
            "title": "Error Rates by Service",
            "type": "graph",
            "gridPos": {"x": 0, "y": 8, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "sum by (service) (rate(http_requests_total{status_code=~\"5..\"}[5m]))",
                "legendFormat": "{{service}}"
              }
            ]
          },
          {
            "id": 5,
            "title": "Overall Error Rate",
            "type": "stat",
            "gridPos": {"x": 12, "y": 8, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "100 * (sum(rate(http_requests_total{status_code=~\"5..\"}[5m])) / sum(rate(http_requests_total[5m])))",
                "legendFormat": "Error %"
              }
            ]
          },
          {
            "id": 6,
            "title": "4xx Error Rate",
            "type": "stat",
            "gridPos": {"x": 18, "y": 8, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "100 * (sum(rate(http_requests_total{status_code=~\"4..\"}[5m])) / sum(rate(http_requests_total[5m])))",
                "legendFormat": "4xx %"
              }
            ]
          },
          {
            "id": 7,
            "title": "P95 Latency by Service (ms)",
            "type": "graph",
            "gridPos": {"x": 0, "y": 16, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "histogram_quantile(0.95, sum by (service, le) (rate(http_request_duration_seconds_bucket[5m]))) * 1000",
                "legendFormat": "{{service}} p95"
              }
            ]
          },
          {
            "id": 8,
            "title": "P99 Latency by Service (ms)",
            "type": "graph",
            "gridPos": {"x": 12, "y": 16, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "histogram_quantile(0.99, sum by (service, le) (rate(http_request_duration_seconds_bucket[5m]))) * 1000",
                "legendFormat": "{{service}} p99"
              }
            ]
          },
          {
            "id": 9,
            "title": "Average Latency (ms)",
            "type": "stat",
            "gridPos": {"x": 0, "y": 24, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "(sum(rate(http_request_duration_seconds_sum[5m])) / sum(rate(http_request_duration_seconds_count[5m]))) * 1000",
                "legendFormat": "Avg latency ms"
              }
            ]
          },
          {
            "id": 10,
            "title": "Active Tenants",
            "type": "stat",
            "gridPos": {"x": 6, "y": 24, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "count(count by (tenant_id) (rate(http_requests_total[5m])))",
                "legendFormat": "Active tenants"
              }
            ]
          },
          {
            "id": 11,
            "title": "Requests per Tenant",
            "type": "stat",
            "gridPos": {"x": 12, "y": 24, "w": 12, "h": 4},
            "targets": [
              {
                "expr": "sum by (tenant_id) (rate(http_requests_total[5m]))",
                "legendFormat": "Tenant {{tenant_id}}"
              }
            ]
          },
          {
            "id": 12,
            "title": "Alert Generation Rate (per minute)",
            "type": "graph",
            "gridPos": {"x": 0, "y": 32, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "rate(ALERTS_FOR_STATE[1m])",
                "legendFormat": "{{alertname}}"
              }
            ]
          },
          {
            "id": 13,
            "title": "Training Job Success Rate",
            "type": "stat",
            "gridPos": {"x": 12, "y": 32, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "100 * (sum(training_job_completed_total{status=\"success\"}) / sum(training_job_completed_total))",
                "legendFormat": "Success rate %"
              }
            ]
          },
          {
            "id": 14,
            "title": "Training Jobs in Progress",
            "type": "stat",
            "gridPos": {"x": 0, "y": 40, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "count(training_job_in_progress)",
                "legendFormat": "Jobs running"
              }
            ]
          },
          {
            "id": 15,
            "title": "Training Job Completion Time (p95, minutes)",
            "type": "stat",
            "gridPos": {"x": 6, "y": 40, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "histogram_quantile(0.95, rate(training_job_duration_seconds_bucket[5m])) / 60",
                "legendFormat": "p95 minutes"
              }
            ]
          },
          {
            "id": 16,
            "title": "Failed Training Jobs",
            "type": "stat",
            "gridPos": {"x": 12, "y": 40, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "sum(training_job_completed_total{status=\"failed\"})",
                "legendFormat": "Failed jobs"
              }
            ]
          },
          {
            "id": 17,
            "title": "Total Training Jobs Completed",
            "type": "stat",
            "gridPos": {"x": 18, "y": 40, "w": 6, "h": 4},
            "targets": [
              {
                "expr": "sum(training_job_completed_total)",
                "legendFormat": "Total completed"
              }
            ]
          },
          {
            "id": 18,
            "title": "API Health Status",
            "type": "table",
            "gridPos": {"x": 0, "y": 48, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "up{job=\"bakery-services\"}",
                "format": "table",
                "instant": true
              }
            ],
            "transformations": [
              {
                "id": "organize",
                "options": {
                  "excludeByName": {},
                  "indexByName": {},
                  "renameByName": {
                    "service": "Service",
                    "Value": "Status",
                    "instance": "Instance"
                  }
                }
              }
            ]
          },
          {
            "id": 19,
            "title": "Service Success Rate (%)",
            "type": "graph",
            "gridPos": {"x": 12, "y": 48, "w": 12, "h": 8},
            "targets": [
              {
                "expr": "100 * (1 - (sum by (service) (rate(http_requests_total{status_code=~\"5..\"}[5m])) / sum by (service) (rate(http_requests_total[5m]))))",
                "legendFormat": "{{service}}"
              }
            ]
          },
          {
            "id": 20,
            "title": "Requests Processed Today",
            "type": "stat",
            "gridPos": {"x": 0, "y": 56, "w": 12, "h": 4},
            "targets": [
              {
                "expr": "sum(increase(http_requests_total[24h]))",
                "legendFormat": "Requests (24h)"
              }
            ]
          },
          {
            "id": 21,
            "title": "Distinct Users Today",
            "type": "stat",
            "gridPos": {"x": 12, "y": 56, "w": 12, "h": 4},
            "targets": [
              {
                "expr": "count(count by (user_id) (increase(http_requests_total{user_id!=\"\"}[24h])))",
                "legendFormat": "Users (24h)"
              }
            ]
          }
        ]
      }
    }
@@ -1,177 +0,0 @@
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboards
  namespace: monitoring
data:
  gateway-metrics.json: |
    {
      "dashboard": {
        "title": "Bakery IA - Gateway Metrics",
        "tags": ["bakery-ia", "gateway"],
        "timezone": "browser",
        "panels": [
          {
            "id": 1,
            "title": "Request Rate by Endpoint",
            "type": "graph",
            "gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
            "targets": [{
              "expr": "rate(http_requests_total{service=\"gateway\"}[5m])",
              "legendFormat": "{{method}} {{endpoint}}"
            }]
          },
          {
            "id": 2,
            "title": "P95 Request Latency",
            "type": "graph",
            "gridPos": {"x": 12, "y": 0, "w": 12, "h": 8},
            "targets": [{
              "expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{service=\"gateway\"}[5m]))",
              "legendFormat": "{{endpoint}} p95"
            }]
          },
          {
            "id": 3,
            "title": "Error Rate (5xx)",
            "type": "graph",
            "gridPos": {"x": 0, "y": 8, "w": 12, "h": 8},
            "targets": [{
              "expr": "rate(http_requests_total{service=\"gateway\",status_code=~\"5..\"}[5m])",
              "legendFormat": "{{endpoint}} errors"
            }]
          },
          {
            "id": 4,
            "title": "Active Requests",
            "type": "stat",
            "gridPos": {"x": 12, "y": 8, "w": 6, "h": 4},
            "targets": [{
              "expr": "sum(rate(http_requests_total{service=\"gateway\"}[1m]))"
            }]
          },
          {
            "id": 5,
            "title": "Authentication Success Rate",
            "type": "stat",
            "gridPos": {"x": 18, "y": 8, "w": 6, "h": 4},
            "targets": [{
              "expr": "rate(gateway_auth_responses_total[5m]) / rate(gateway_auth_requests_total[5m]) * 100"
            }]
          }
        ],
        "refresh": "10s",
        "schemaVersion": 16,
        "version": 1
      }
    }

  services-overview.json: |
    {
      "dashboard": {
        "title": "Bakery IA - Services Overview",
        "tags": ["bakery-ia", "services"],
        "timezone": "browser",
        "panels": [
          {
            "id": 1,
            "title": "Request Rate by Service",
            "type": "graph",
            "gridPos": {"x": 0, "y": 0, "w": 12, "h": 8},
            "targets": [{
              "expr": "sum by (service) (rate(http_requests_total[5m]))",
              "legendFormat": "{{service}}"
            }]
          },
          {
            "id": 2,
            "title": "P99 Latency by Service",
            "type": "graph",
            "gridPos": {"x": 12, "y": 0, "w": 12, "h": 8},
            "targets": [{
              "expr": "histogram_quantile(0.99, sum by (service, le) (rate(http_request_duration_seconds_bucket[5m])))",
              "legendFormat": "{{service}} p99"
            }]
          },
          {
            "id": 3,
            "title": "Error Rate by Service",
            "type": "graph",
            "gridPos": {"x": 0, "y": 8, "w": 24, "h": 8},
            "targets": [{
              "expr": "sum by (service) (rate(http_requests_total{status_code=~\"5..\"}[5m]))",
              "legendFormat": "{{service}}"
            }]
          },
          {
            "id": 4,
            "title": "Service Health Status",
            "type": "table",
            "gridPos": {"x": 0, "y": 16, "w": 24, "h": 8},
            "targets": [{
              "expr": "up{job=\"bakery-services\"}",
              "format": "table",
              "instant": true
            }],
            "transformations": [{
              "id": "organize",
              "options": {
                "excludeByName": {},
                "indexByName": {},
                "renameByName": {
                  "service": "Service Name",
                  "Value": "Status"
                }
              }
            }]
          }
        ],
        "refresh": "30s",
        "schemaVersion": 16,
        "version": 1
      }
    }

  circuit-breakers.json: |
    {
      "dashboard": {
        "title": "Bakery IA - Circuit Breakers",
        "tags": ["bakery-ia", "reliability"],
        "timezone": "browser",
        "panels": [
          {
            "id": 1,
            "title": "Circuit Breaker States",
            "type": "stat",
            "gridPos": {"x": 0, "y": 0, "w": 24, "h": 4},
            "targets": [{
              "expr": "circuit_breaker_state",
              "legendFormat": "{{service}} - {{state}}"
            }]
          },
          {
            "id": 2,
            "title": "Circuit Breaker Trips",
            "type": "graph",
            "gridPos": {"x": 0, "y": 4, "w": 12, "h": 8},
            "targets": [{
              "expr": "rate(circuit_breaker_opened_total[5m])",
              "legendFormat": "{{service}}"
            }]
          },
          {
            "id": 3,
            "title": "Rejected Requests",
            "type": "graph",
            "gridPos": {"x": 12, "y": 4, "w": 12, "h": 8},
            "targets": [{
              "expr": "rate(circuit_breaker_rejected_total[5m])",
              "legendFormat": "{{service}}"
            }]
          }
        ],
        "refresh": "10s",
        "schemaVersion": 16,
        "version": 1
      }
    }
@@ -1,166 +0,0 @@
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-datasources
  namespace: monitoring
data:
  prometheus.yaml: |
    apiVersion: 1
    datasources:
      - name: Prometheus
        type: prometheus
        access: proxy
        url: http://prometheus:9090
        isDefault: true
        editable: false

---
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboards-config
  namespace: monitoring
data:
  dashboards.yaml: |
    apiVersion: 1
    providers:
      - name: 'default'
        orgId: 1
        folder: 'Bakery IA'
        type: file
        disableDeletion: false
        updateIntervalSeconds: 10
        allowUiUpdates: true
        options:
          path: /var/lib/grafana/dashboards
      - name: 'extended'
        orgId: 1
        folder: 'Bakery IA - Extended'
        type: file
        disableDeletion: false
        updateIntervalSeconds: 10
        allowUiUpdates: true
        options:
          path: /var/lib/grafana/dashboards-extended

---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: grafana
  namespace: monitoring
  labels:
    app: grafana
spec:
  replicas: 1
  selector:
    matchLabels:
      app: grafana
  template:
    metadata:
      labels:
        app: grafana
    spec:
      containers:
        - name: grafana
          image: grafana/grafana:12.3.0
          ports:
            - containerPort: 3000
              name: http
          env:
            - name: GF_SECURITY_ADMIN_USER
              valueFrom:
                secretKeyRef:
                  name: grafana-admin
                  key: admin-user
            - name: GF_SECURITY_ADMIN_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: grafana-admin
                  key: admin-password
            - name: GF_SERVER_ROOT_URL
              value: "http://monitoring.bakery-ia.local/grafana"
            - name: GF_SERVER_SERVE_FROM_SUB_PATH
              value: "true"
            - name: GF_AUTH_ANONYMOUS_ENABLED
              value: "false"
            - name: GF_INSTALL_PLUGINS
              value: ""
          volumeMounts:
            - name: grafana-storage
              mountPath: /var/lib/grafana
            - name: grafana-datasources
              mountPath: /etc/grafana/provisioning/datasources
            - name: grafana-dashboards-config
              mountPath: /etc/grafana/provisioning/dashboards
            - name: grafana-dashboards
              mountPath: /var/lib/grafana/dashboards
            - name: grafana-dashboards-extended
              mountPath: /var/lib/grafana/dashboards-extended
          resources:
            requests:
              memory: "256Mi"
              cpu: "100m"
            limits:
              memory: "512Mi"
              cpu: "500m"
          livenessProbe:
            httpGet:
              path: /api/health
              port: 3000
            initialDelaySeconds: 30
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /api/health
              port: 3000
            initialDelaySeconds: 5
            periodSeconds: 5
      volumes:
        - name: grafana-storage
          persistentVolumeClaim:
            claimName: grafana-storage
        - name: grafana-datasources
          configMap:
            name: grafana-datasources
        - name: grafana-dashboards-config
          configMap:
            name: grafana-dashboards-config
        - name: grafana-dashboards
          configMap:
            name: grafana-dashboards
        - name: grafana-dashboards-extended
          configMap:
            name: grafana-dashboards-extended

---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: grafana-storage
  namespace: monitoring
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 5Gi

---
apiVersion: v1
kind: Service
metadata:
  name: grafana
  namespace: monitoring
  labels:
    app: grafana
spec:
  type: ClusterIP
  ports:
    - port: 3000
      targetPort: 3000
      protocol: TCP
      name: http
  selector:
    app: grafana
@@ -1,100 +0,0 @@
---
# PodDisruptionBudgets ensure minimum availability during voluntary disruptions
# (node drains, rolling updates, etc.)

apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  name: prometheus-pdb
  namespace: monitoring
spec:
  minAvailable: 1
  selector:
    matchLabels:
      app: prometheus

---
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  name: alertmanager-pdb
  namespace: monitoring
spec:
  minAvailable: 2
  selector:
    matchLabels:
      app: alertmanager

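# With the alertmanager StatefulSet running replicas: 3, minAvailable: 2
# means a voluntary disruption (node drain, rolling update) can evict at
# most one alertmanager pod at a time, keeping a majority of the gossip
# mesh available.
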
---
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  name: grafana-pdb
  namespace: monitoring
spec:
  minAvailable: 1
  selector:
    matchLabels:
      app: grafana

---
# ResourceQuota limits total resources in monitoring namespace
apiVersion: v1
kind: ResourceQuota
metadata:
  name: monitoring-quota
  namespace: monitoring
spec:
  hard:
    # Compute resources
    requests.cpu: "10"
    requests.memory: "16Gi"
    limits.cpu: "20"
    limits.memory: "32Gi"

    # Storage
    persistentvolumeclaims: "10"
    requests.storage: "100Gi"

    # Object counts
    pods: "50"
    services: "20"
    configmaps: "30"
    secrets: "20"

---
# LimitRange sets default resource limits for pods in monitoring namespace
apiVersion: v1
kind: LimitRange
metadata:
  name: monitoring-limits
  namespace: monitoring
spec:
  limits:
    # Default container limits
    - max:
        cpu: "2"
        memory: "4Gi"
      min:
        cpu: "10m"
        memory: "16Mi"
      default:
        cpu: "500m"
        memory: "512Mi"
      defaultRequest:
        cpu: "100m"
        memory: "128Mi"
      type: Container

    # Pod limits
    - max:
        cpu: "4"
        memory: "8Gi"
      type: Pod

    # PVC limits
    - max:
        storage: "50Gi"
      min:
        storage: "1Gi"
      type: PersistentVolumeClaim
@@ -1,42 +0,0 @@
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: monitoring-ingress
  namespace: monitoring
  annotations:
    nginx.ingress.kubernetes.io/rewrite-target: /$2
    nginx.ingress.kubernetes.io/ssl-redirect: "false"
spec:
  rules:
    - host: monitoring.bakery-ia.local
      http:
        paths:
          - path: /grafana(/|$)(.*)
            pathType: ImplementationSpecific
            backend:
              service:
                name: grafana
                port:
                  number: 3000
          - path: /prometheus(/|$)(.*)
            pathType: ImplementationSpecific
            backend:
              service:
                name: prometheus-external
                port:
                  number: 9090
          - path: /jaeger(/|$)(.*)
            pathType: ImplementationSpecific
            backend:
              service:
                name: jaeger-query
                port:
                  number: 16686
          - path: /alertmanager(/|$)(.*)
            pathType: ImplementationSpecific
            backend:
              service:
                name: alertmanager-external
                port:
                  number: 9093
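# With rewrite-target /$2, the backend receives only the second capture
# group of the matched path: a request for /grafana/api/health reaches the
# grafana service as /api/health, which is why Grafana above is configured
# with GF_SERVER_SERVE_FROM_SUB_PATH and a root URL under /grafana.
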
@@ -1,190 +0,0 @@
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: jaeger
  namespace: monitoring
  labels:
    app: jaeger
spec:
  replicas: 1
  selector:
    matchLabels:
      app: jaeger
  template:
    metadata:
      labels:
        app: jaeger
    spec:
      containers:
        - name: jaeger
          image: jaegertracing/all-in-one:1.51
          env:
            - name: COLLECTOR_ZIPKIN_HOST_PORT
              value: ":9411"
            - name: COLLECTOR_OTLP_ENABLED
              value: "true"
            - name: SPAN_STORAGE_TYPE
              value: "badger"
            - name: BADGER_EPHEMERAL
              value: "false"
            - name: BADGER_DIRECTORY_VALUE
              value: "/badger/data"
            - name: BADGER_DIRECTORY_KEY
              value: "/badger/key"
          ports:
            - containerPort: 5775
              protocol: UDP
              name: zipkin-compact
            - containerPort: 6831
              protocol: UDP
              name: jaeger-compact
            - containerPort: 6832
              protocol: UDP
              name: jaeger-binary
            - containerPort: 5778
              protocol: TCP
              name: config-rest
            - containerPort: 16686
              protocol: TCP
              name: query
            - containerPort: 14250
              protocol: TCP
              name: grpc
            - containerPort: 14268
              protocol: TCP
              name: c-tchan-trft
            - containerPort: 14269
              protocol: TCP
              name: admin-http
            - containerPort: 9411
              protocol: TCP
              name: zipkin
            - containerPort: 4317
              protocol: TCP
              name: otlp-grpc
            - containerPort: 4318
              protocol: TCP
              name: otlp-http
          volumeMounts:
            - name: jaeger-storage
              mountPath: /badger
          resources:
            requests:
              memory: "512Mi"
              cpu: "250m"
            limits:
              memory: "1Gi"
              cpu: "500m"
          livenessProbe:
            httpGet:
              path: /
              port: 14269
            initialDelaySeconds: 30
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /
              port: 14269
            initialDelaySeconds: 5
            periodSeconds: 5
      volumes:
        - name: jaeger-storage
          persistentVolumeClaim:
            claimName: jaeger-storage

---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: jaeger-storage
  namespace: monitoring
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 10Gi

---
apiVersion: v1
kind: Service
metadata:
  name: jaeger-query
  namespace: monitoring
  labels:
    app: jaeger
spec:
  type: ClusterIP
  ports:
    - port: 16686
      targetPort: 16686
      protocol: TCP
      name: query
  selector:
    app: jaeger

---
apiVersion: v1
kind: Service
metadata:
  name: jaeger-collector
  namespace: monitoring
  labels:
    app: jaeger
spec:
  type: ClusterIP
  ports:
    - port: 14268
      targetPort: 14268
      protocol: TCP
      name: c-tchan-trft
    - port: 14250
      targetPort: 14250
      protocol: TCP
      name: grpc
    - port: 9411
      targetPort: 9411
      protocol: TCP
      name: zipkin
    - port: 4317
      targetPort: 4317
      protocol: TCP
      name: otlp-grpc
    - port: 4318
      targetPort: 4318
      protocol: TCP
      name: otlp-http
  selector:
    app: jaeger

---
apiVersion: v1
kind: Service
metadata:
  name: jaeger-agent
  namespace: monitoring
  labels:
    app: jaeger
spec:
  type: ClusterIP
  clusterIP: None
  ports:
    - port: 5775
      targetPort: 5775
      protocol: UDP
      name: zipkin-compact
    - port: 6831
      targetPort: 6831
      protocol: UDP
      name: jaeger-compact
    - port: 6832
      targetPort: 6832
      protocol: UDP
      name: jaeger-binary
    - port: 5778
      targetPort: 5778
      protocol: TCP
      name: config-rest
  selector:
    app: jaeger
@@ -1,18 +1,20 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

# Minimal Monitoring Infrastructure
# SigNoz is now managed via Helm in the 'signoz' namespace
# This kustomization only maintains:
# - Namespace for legacy resources (if needed)
# - Node exporter for infrastructure metrics
# - PostgreSQL exporter for database metrics
# - Optional OTEL collector (can be disabled if using SigNoz's built-in collector)

resources:
  - namespace.yaml
  - secrets.yaml
  - prometheus.yaml
  - alert-rules.yaml
  - alertmanager.yaml
  - alertmanager-init.yaml
  - grafana.yaml
  - grafana-dashboards.yaml
  - grafana-dashboards-extended.yaml
  - postgres-exporter.yaml
  # Exporters for metrics collection
  - node-exporter.yaml
  - jaeger.yaml
  - ha-policies.yaml
  - ingress.yaml
  # Optional: Keep OTEL collector or use SigNoz's built-in one
  # Uncomment if you want a dedicated OTEL collector in monitoring namespace
  # - otel-collector.yaml

@@ -0,0 +1,167 @@
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: otel-collector-config
  namespace: monitoring
data:
  otel-collector-config.yaml: |
    extensions:
      health_check:
        endpoint: 0.0.0.0:13133

    receivers:
      otlp:
        protocols:
          grpc:
            endpoint: 0.0.0.0:4317
          http:
            endpoint: 0.0.0.0:4318

    processors:
      batch:
        timeout: 10s
        send_batch_size: 1024

      # Memory limiter to prevent OOM
      memory_limiter:
        check_interval: 1s
        limit_mib: 512
        spike_limit_mib: 128

    exporters:
      # Export metrics to Prometheus
      prometheus:
        endpoint: "0.0.0.0:8889"
        namespace: otelcol
        const_labels:
          source: otel-collector

      # Export to SigNoz via its OTLP collector in the 'signoz' namespace
      otlp/signoz:
        endpoint: "signoz-otel-collector.signoz.svc.cluster.local:4317"
        tls:
          insecure: true

      # Logging exporter for debugging traces and logs
      logging:
        loglevel: info
        sampling_initial: 5
        sampling_thereafter: 200

    service:
      extensions: [health_check]
      pipelines:
        # Traces pipeline: receive -> process -> export to SigNoz
        traces:
          receivers: [otlp]
          processors: [memory_limiter, batch]
          exporters: [otlp/signoz, logging]

        # Metrics pipeline: receive -> process -> export to both Prometheus and SigNoz
        metrics:
          receivers: [otlp]
          processors: [memory_limiter, batch]
          exporters: [prometheus, otlp/signoz]

        # Logs pipeline: receive -> process -> export to SigNoz
        logs:
          receivers: [otlp]
          processors: [memory_limiter, batch]
          exporters: [otlp/signoz, logging]

---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: otel-collector
  namespace: monitoring
  labels:
    app: otel-collector
spec:
  replicas: 1
  selector:
    matchLabels:
      app: otel-collector
  template:
    metadata:
      labels:
        app: otel-collector
    spec:
      containers:
        - name: otel-collector
          image: otel/opentelemetry-collector-contrib:0.91.0
          args:
            - --config=/conf/otel-collector-config.yaml
          ports:
            - containerPort: 4317
              protocol: TCP
              name: otlp-grpc
            - containerPort: 4318
              protocol: TCP
              name: otlp-http
            - containerPort: 8889
              protocol: TCP
              name: prometheus
            - containerPort: 13133
              protocol: TCP
              name: health-check
          volumeMounts:
            - name: otel-collector-config
              mountPath: /conf
          resources:
            requests:
              memory: "256Mi"
              cpu: "100m"
            limits:
              memory: "512Mi"
              cpu: "500m"
          livenessProbe:
            httpGet:
              path: /
              port: 13133
            initialDelaySeconds: 30
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /
              port: 13133
            initialDelaySeconds: 5
            periodSeconds: 5
      volumes:
        - name: otel-collector-config
          configMap:
            name: otel-collector-config
            items:
              - key: otel-collector-config.yaml
                path: otel-collector-config.yaml

---
apiVersion: v1
kind: Service
metadata:
  name: otel-collector
  namespace: monitoring
  labels:
    app: otel-collector
  annotations:
    prometheus.io/scrape: "true"
    prometheus.io/port: "8889"
    prometheus.io/path: "/metrics"
spec:
  type: ClusterIP
  ports:
    - port: 4317
      targetPort: 4317
      protocol: TCP
      name: otlp-grpc
    - port: 4318
      targetPort: 4318
      protocol: TCP
      name: otlp-http
    - port: 8889
      targetPort: 8889
      protocol: TCP
      name: prometheus
  selector:
    app: otel-collector
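# A workload ships telemetry to this collector through the standard OTLP
# env vars; a minimal container-spec sketch (the values follow the Service
# above, and the variable names are the OpenTelemetry SDK defaults):
#
#   env:
#     - name: OTEL_EXPORTER_OTLP_ENDPOINT
#       value: "http://otel-collector.monitoring.svc.cluster.local:4317"
#     - name: OTEL_EXPORTER_OTLP_PROTOCOL
#       value: "grpc"
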
@@ -1,278 +0,0 @@
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: monitoring

---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
  - apiGroups: [""]
    resources:
      - nodes
      - nodes/proxy
      - services
      - endpoints
      - pods
    verbs: ["get", "list", "watch"]
  - apiGroups:
      - extensions
    resources:
      - ingresses
    verbs: ["get", "list", "watch"]
  - nonResourceURLs: ["/metrics"]
    verbs: ["get"]

---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
  - kind: ServiceAccount
    name: prometheus
    namespace: monitoring

---
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: monitoring
data:
  prometheus.yml: |
    global:
      scrape_interval: 30s
      evaluation_interval: 30s
      external_labels:
        cluster: 'bakery-ia'
        environment: 'production'

    # AlertManager configuration
    alerting:
      alertmanagers:
        - static_configs:
            - targets:
                - alertmanager-0.alertmanager.monitoring.svc.cluster.local:9093
                - alertmanager-1.alertmanager.monitoring.svc.cluster.local:9093
                - alertmanager-2.alertmanager.monitoring.svc.cluster.local:9093

    # Load alert rules
    rule_files:
      - '/etc/prometheus/rules/*.yml'

    scrape_configs:
      # Scrape Prometheus itself
      - job_name: 'prometheus'
        static_configs:
          - targets: ['localhost:9090']

      # Scrape all bakery-ia services
      - job_name: 'bakery-services'
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names:
                - bakery-ia
        relabel_configs:
          # Only scrape pods with metrics port
          - source_labels: [__meta_kubernetes_pod_container_port_name]
            action: keep
            regex: http

          # Add service name label
          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
            target_label: service

          # Add component label
          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component]
            target_label: component

          # Add pod name
          - source_labels: [__meta_kubernetes_pod_name]
            target_label: pod

          # Set metrics path
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)

          # Set scrape port
          - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
            action: replace
            regex: ([^:]+)(?::\d+)?;(\d+)
            replacement: $1:$2
            target_label: __address__
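      # A pod opts into this job by exposing a container port named "http";
      # path and port can then be overridden per pod via annotations — a
      # sketch of the relevant pod metadata (values illustrative):
      #
      #   metadata:
      #     annotations:
      #       prometheus.io/path: "/metrics"
      #       prometheus.io/port: "8000"
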
# Scrape Kubernetes nodes
|
||||
- job_name: 'kubernetes-nodes'
|
||||
kubernetes_sd_configs:
|
||||
- role: node
|
||||
relabel_configs:
|
||||
- action: labelmap
|
||||
regex: __meta_kubernetes_node_label_(.+)
|
||||
- target_label: __address__
|
||||
replacement: kubernetes.default.svc:443
|
||||
- source_labels: [__meta_kubernetes_node_name]
|
||||
regex: (.+)
|
||||
target_label: __metrics_path__
|
||||
replacement: /api/v1/nodes/${1}/proxy/metrics
|
||||
|
||||
# Scrape AlertManager
|
||||
- job_name: 'alertmanager'
|
||||
static_configs:
|
||||
- targets:
|
||||
- alertmanager-0.alertmanager.monitoring.svc.cluster.local:9093
|
||||
- alertmanager-1.alertmanager.monitoring.svc.cluster.local:9093
|
||||
- alertmanager-2.alertmanager.monitoring.svc.cluster.local:9093
|
||||
|
||||
# Scrape PostgreSQL exporter
|
||||
- job_name: 'postgres-exporter'
|
||||
static_configs:
|
||||
- targets: ['postgres-exporter.monitoring.svc.cluster.local:9187']
|
||||
|
||||
# Scrape Node Exporter
|
||||
- job_name: 'node-exporter'
|
||||
kubernetes_sd_configs:
|
||||
- role: node
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
regex: '(.*):10250'
|
||||
replacement: '${1}:9100'
|
||||
target_label: __address__
|
||||
- source_labels: [__meta_kubernetes_node_name]
|
||||
target_label: node
|
||||
|
||||
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: prometheus
  namespace: monitoring
  labels:
    app: prometheus
spec:
  serviceName: prometheus
  replicas: 2
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      serviceAccountName: prometheus
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values:
                        - prometheus
                topologyKey: kubernetes.io/hostname
      containers:
        - name: prometheus
          image: prom/prometheus:v3.0.1
          args:
            - '--config.file=/etc/prometheus/prometheus.yml'
            - '--storage.tsdb.path=/prometheus'
            - '--storage.tsdb.retention.time=30d'
            - '--web.console.libraries=/usr/share/prometheus/console_libraries'
            - '--web.console.templates=/usr/share/prometheus/consoles'
            - '--web.enable-lifecycle'
          ports:
            - containerPort: 9090
              name: web
          volumeMounts:
            - name: prometheus-config
              mountPath: /etc/prometheus
            - name: prometheus-rules
              mountPath: /etc/prometheus/rules
            - name: prometheus-storage
              mountPath: /prometheus
          resources:
            requests:
              memory: "1Gi"
              cpu: "500m"
            limits:
              memory: "2Gi"
              cpu: "1"
          livenessProbe:
            httpGet:
              path: /-/healthy
              port: 9090
            initialDelaySeconds: 30
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /-/ready
              port: 9090
            initialDelaySeconds: 5
            periodSeconds: 5
      volumes:
        - name: prometheus-config
          configMap:
            name: prometheus-config
        - name: prometheus-rules
          configMap:
            name: prometheus-alert-rules

  volumeClaimTemplates:
    - metadata:
        name: prometheus-storage
      spec:
        accessModes: [ "ReadWriteOnce" ]
        resources:
          requests:
            storage: 20Gi
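The liveness and readiness probes above use Prometheus's built-in health endpoints, which can also be checked by hand. A quick sketch, assuming the pod is port-forwarded to localhost:9090:

    from urllib.request import urlopen

    # /-/healthy answers once the process is up; /-/ready once it can serve queries.
    for path in ("/-/healthy", "/-/ready"):
        with urlopen(f"http://localhost:9090{path}") as resp:
            print(path, resp.status)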
---
apiVersion: v1
kind: Service
metadata:
  name: prometheus
  namespace: monitoring
  labels:
    app: prometheus
spec:
  type: ClusterIP
  clusterIP: None
  ports:
    - port: 9090
      targetPort: 9090
      protocol: TCP
      name: web
  selector:
    app: prometheus

---
apiVersion: v1
kind: Service
metadata:
  name: prometheus-external
  namespace: monitoring
  labels:
    app: prometheus
spec:
  type: ClusterIP
  ports:
    - port: 9090
      targetPort: 9090
      protocol: TCP
      name: web
  selector:
    app: prometheus
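The first Service is headless (clusterIP: None), which gives each StatefulSet pod a stable DNS name of the form <pod>.<service>.<namespace>.svc.cluster.local; the alertmanager scrape targets above rely on the same convention. A small sketch of how those names are derived (hypothetical helper, not part of the repo):

    def pod_dns(statefulset: str, service: str, namespace: str, replicas: int) -> list[str]:
        """Stable per-pod DNS names provided by a headless Service."""
        return [
            f"{statefulset}-{i}.{service}.{namespace}.svc.cluster.local"
            for i in range(replicas)
        ]

    print(pod_dns("prometheus", "prometheus", "monitoring", 2))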
@@ -14,9 +14,10 @@ data:
  DEBUG: "false"
  LOG_LEVEL: "INFO"

  # Observability Settings
  # Set to "true" when Jaeger/monitoring stack is deployed
  ENABLE_TRACING: "false"
  # Observability Settings - SigNoz enabled
  ENABLE_TRACING: "true"
  ENABLE_METRICS: "true"
  ENABLE_LOGS: "true"

  # Database initialization settings
  # IMPORTANT: Services NEVER run migrations - they only verify DB is ready
@@ -286,12 +287,11 @@ data:
  LOG_FILE_PATH: "/app/logs"
  LOG_ROTATION_SIZE: "100MB"
  LOG_RETENTION_DAYS: "30"
  PROMETHEUS_ENABLED: "true"
  PROMETHEUS_RETENTION: "200h"
  HEALTH_CHECK_TIMEOUT: "30"
  HEALTH_CHECK_INTERVAL: "30"
  PROMETHEUS_RETENTION_DAYS: "30"
  GRAFANA_ROOT_URL: "http://monitoring.bakery-ia.local/grafana"

  # Monitoring Configuration - SigNoz
  SIGNOZ_ROOT_URL: "http://localhost/signoz"

  # ================================================================
  # DATA COLLECTION SETTINGS
@@ -382,16 +382,20 @@ data:
  NOMINATIM_CPU_LIMIT: "4"

  # ================================================================
  # DISTRIBUTED TRACING (Jaeger/OpenTelemetry)
  # OBSERVABILITY - SigNoz (Unified Monitoring)
  # ================================================================
  JAEGER_COLLECTOR_ENDPOINT: "http://jaeger-collector.monitoring:4317"
  JAEGER_AGENT_HOST: "jaeger-agent.monitoring"
  JAEGER_AGENT_PORT: "6831"
  OTEL_EXPORTER_OTLP_ENDPOINT: "http://jaeger-collector.monitoring:4317"
  # OpenTelemetry Configuration - Direct to SigNoz
  OTEL_EXPORTER_OTLP_ENDPOINT: "http://signoz-otel-collector.signoz.svc.cluster.local:4317"
  OTEL_EXPORTER_OTLP_PROTOCOL: "grpc"
  OTEL_SERVICE_NAME: "bakery-ia"
  OTEL_RESOURCE_ATTRIBUTES: "deployment.environment=development"

  # SigNoz Endpoints
  SIGNOZ_ENDPOINT: "http://signoz-query-service.signoz.svc.cluster.local:8080"
  SIGNOZ_FRONTEND_URL: "http://signoz-frontend.signoz.svc.cluster.local:3301"

  # ================================================================
  # REPLENISHMENT PLANNING SETTINGS
  # ================================================================
  REPLENISHMENT_PROJECTION_HORIZON_DAYS: "7"
  REPLENISHMENT_SERVICE_LEVEL: "0.95"
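OTEL_RESOURCE_ATTRIBUTES is a comma-separated list of key=value pairs that the SDK attaches to every exported span, metric, and log record. A minimal sketch of the parsing the SDK performs (illustrative, not the SDK's actual code):

    def parse_resource_attributes(raw: str) -> dict[str, str]:
        """Parse the key=value,key=value format used by OTEL_RESOURCE_ATTRIBUTES."""
        return dict(pair.split("=", 1) for pair in raw.split(",") if pair)

    print(parse_resource_attributes("deployment.environment=development"))
    # {'deployment.environment': 'development'}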
@@ -9,11 +9,14 @@ metadata:

resources:
  - ../../base
  # Monitoring disabled for dev to save resources
  # - ../../base/components/monitoring
  # Monitoring enabled for dev environment
  - ../../base/components/monitoring
  - dev-ingress.yaml
  # SigNoz ingress is applied by Tilt (see Tiltfile)
  # - signoz-ingress.yaml
  # Dev-Prod Parity: Enable HTTPS with self-signed certificates
  - dev-certificate.yaml
  - monitoring-certificate.yaml
  - cluster-issuer-staging.yaml

# Exclude nominatim from dev to save resources
@@ -608,6 +611,39 @@ patches:
        limits:
          memory: "512Mi"
          cpu: "300m"
  # Optional exporters resource patches for dev
  - target:
      group: apps
      version: v1
      kind: DaemonSet
      name: node-exporter
      namespace: monitoring
    patch: |-
      - op: replace
        path: /spec/template/spec/containers/0/resources
        value:
          requests:
            memory: "32Mi"
            cpu: "25m"
          limits:
            memory: "64Mi"
            cpu: "100m"
  - target:
      group: apps
      version: v1
      kind: Deployment
      name: postgres-exporter
      namespace: monitoring
    patch: |-
      - op: replace
        path: /spec/template/spec/containers/0/resources
        value:
          requests:
            memory: "32Mi"
            cpu: "25m"
          limits:
            memory: "64Mi"
            cpu: "100m"

secretGenerator:
  - name: dev-secrets
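Each exporter patch above is an RFC 6902 JSON patch that kustomize applies at the given path. A toy sketch of what the single "replace" op does to a container spec (illustrative only):

    container = {
        "name": "node-exporter",
        "resources": {"requests": {"memory": "128Mi"}, "limits": {"memory": "256Mi"}},
    }

    # op: replace at .../containers/0/resources swaps the whole resources dict.
    container["resources"] = {
        "requests": {"memory": "32Mi", "cpu": "25m"},
        "limits": {"memory": "64Mi", "cpu": "100m"},
    }
    print(container["resources"]["limits"])  # {'memory': '64Mi', 'cpu': '100m'}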
@@ -0,0 +1,49 @@
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: bakery-dev-monitoring-tls-cert
  namespace: monitoring
spec:
  # Self-signed certificate for local development
  secretName: bakery-ia-tls-cert

  # Certificate duration
  duration: 2160h # 90 days
  renewBefore: 360h # 15 days

  # Subject configuration
  subject:
    organizations:
      - Bakery IA Development

  # Common name
  commonName: localhost

  # DNS names this certificate is valid for
  dnsNames:
    - localhost
    - monitoring.bakery-ia.local

  # IP addresses (for localhost)
  ipAddresses:
    - 127.0.0.1
    - ::1

  # Use self-signed issuer for development
  issuerRef:
    name: selfsigned-issuer
    kind: ClusterIssuer
    group: cert-manager.io

  # Private key configuration
  privateKey:
    algorithm: RSA
    encoding: PKCS1
    size: 2048

  # Usages
  usages:
    - server auth
    - client auth
    - digital signature
    - key encipherment
39  infrastructure/kubernetes/overlays/dev/signoz-ingress.yaml  Normal file
@@ -0,0 +1,39 @@
---
# SigNoz Ingress for Development (localhost)
# SigNoz is deployed via Helm in the 'signoz' namespace
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: signoz-ingress-localhost
  namespace: signoz
  annotations:
    nginx.ingress.kubernetes.io/ssl-redirect: "true"
    nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
    nginx.ingress.kubernetes.io/rewrite-target: /$2
    nginx.ingress.kubernetes.io/use-regex: "true"
spec:
  ingressClassName: nginx
  tls:
    - hosts:
        - localhost
      secretName: bakery-ia-tls-cert
  rules:
    - host: localhost
      http:
        paths:
          # SigNoz Frontend UI
          - path: /signoz(/|$)(.*)
            pathType: ImplementationSpecific
            backend:
              service:
                name: signoz-frontend
                port:
                  number: 3301
          # SigNoz Query Service API
          - path: /signoz-api(/|$)(.*)
            pathType: ImplementationSpecific
            backend:
              service:
                name: signoz-query-service
                port:
                  number: 8080
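With use-regex and rewrite-target: /$2, a request to /signoz/services is forwarded to the SigNoz frontend as /services, and /signoz itself becomes /. A sketch of the rewrite in Python's re, approximating nginx's behavior:

    import re

    rule = re.compile(r"^/signoz(/|$)(.*)")
    for path in ("/signoz", "/signoz/", "/signoz/services"):
        m = rule.match(path)
        print(path, "->", "/" + m.group(2))
    # /signoz -> /
    # /signoz/ -> /
    # /signoz/services -> /services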
@@ -14,6 +14,7 @@ resources:

patchesStrategicMerge:
  - storage-patch.yaml
  - monitoring-ingress-patch.yaml

labels:
  - includeSelectors: true
@@ -21,6 +22,89 @@ labels:
    environment: production
    tier: production

# SigNoz resource patches for production
patches:
  # SigNoz ClickHouse production configuration
  - target:
      group: apps
      version: v1
      kind: StatefulSet
      name: signoz-clickhouse
      namespace: signoz
    patch: |-
      - op: replace
        path: /spec/replicas
        value: 2
      - op: replace
        path: /spec/template/spec/containers/0/resources
        value:
          requests:
            memory: "2Gi"
            cpu: "500m"
          limits:
            memory: "4Gi"
            cpu: "1000m"
  # SigNoz Query Service production configuration
  - target:
      group: apps
      version: v1
      kind: Deployment
      name: signoz-query-service
      namespace: signoz
    patch: |-
      - op: replace
        path: /spec/replicas
        value: 2
      - op: replace
        path: /spec/template/spec/containers/0/resources
        value:
          requests:
            memory: "1Gi"
            cpu: "500m"
          limits:
            memory: "2Gi"
            cpu: "1000m"
  # SigNoz AlertManager production configuration
  - target:
      group: apps
      version: v1
      kind: Deployment
      name: signoz-alertmanager
      namespace: signoz
    patch: |-
      - op: replace
        path: /spec/replicas
        value: 2
      - op: replace
        path: /spec/template/spec/containers/0/resources
        value:
          requests:
            memory: "512Mi"
            cpu: "250m"
          limits:
            memory: "1Gi"
            cpu: "500m"
  # SigNoz Frontend production configuration
  - target:
      group: apps
      version: v1
      kind: Deployment
      name: signoz-frontend
      namespace: signoz
    patch: |-
      - op: replace
        path: /spec/replicas
        value: 2
      - op: replace
        path: /spec/template/spec/containers/0/resources
        value:
          requests:
            memory: "512Mi"
            cpu: "250m"
          limits:
            memory: "1Gi"
            cpu: "500m"

images:
  - name: bakery/auth-service
    newTag: latest
@@ -17,14 +17,30 @@ data:
  REQUEST_TIMEOUT: "30"
  MAX_CONNECTIONS: "100"

  # Monitoring
  PROMETHEUS_ENABLED: "true"
  # Monitoring - SigNoz (Unified Observability)
  ENABLE_TRACING: "true"
  ENABLE_METRICS: "true"
  JAEGER_ENABLED: "true"
  JAEGER_AGENT_HOST: "jaeger-agent.monitoring.svc.cluster.local"
  JAEGER_AGENT_PORT: "6831"
  ENABLE_LOGS: "true"

  # OpenTelemetry Configuration - Direct to SigNoz
  OTEL_EXPORTER_OTLP_ENDPOINT: "http://signoz-otel-collector.signoz.svc.cluster.local:4317"
  OTEL_EXPORTER_OTLP_PROTOCOL: "grpc"
  OTEL_SERVICE_NAME: "bakery-ia"
  OTEL_RESOURCE_ATTRIBUTES: "deployment.environment=production,cluster.name=bakery-ia-prod"

  # SigNoz Endpoints
  SIGNOZ_ENDPOINT: "http://signoz-query-service.signoz.svc.cluster.local:8080"
  SIGNOZ_FRONTEND_URL: "https://monitoring.bakewise.ai/signoz"
  SIGNOZ_ROOT_URL: "https://monitoring.bakewise.ai/signoz"

  # Rate Limiting (stricter in production)
  RATE_LIMIT_ENABLED: "true"
  RATE_LIMIT_PER_MINUTE: "60"

  # CORS Configuration for Production
  CORS_ORIGINS: "https://bakewise.ai"
  CORS_ALLOW_CREDENTIALS: "true"

  # Frontend Configuration
  VITE_API_URL: "/api"
  VITE_ENVIRONMENT: "production"
@@ -16,7 +16,7 @@ metadata:

    # CORS configuration for production
    nginx.ingress.kubernetes.io/enable-cors: "true"
    nginx.ingress.kubernetes.io/cors-allow-origin: "https://bakery.yourdomain.com,https://api.yourdomain.com"
    nginx.ingress.kubernetes.io/cors-allow-origin: "https://bakewise.ai"
    nginx.ingress.kubernetes.io/cors-allow-methods: "GET, POST, PUT, DELETE, OPTIONS, PATCH"
    nginx.ingress.kubernetes.io/cors-allow-headers: "Content-Type, Authorization, X-Requested-With, Accept, Origin"
    nginx.ingress.kubernetes.io/cors-allow-credentials: "true"
@@ -40,12 +40,10 @@ spec:
  ingressClassName: nginx
  tls:
    - hosts:
        - bakery.yourdomain.com
        - api.yourdomain.com
        - monitoring.yourdomain.com
        - bakewise.ai
      secretName: bakery-ia-prod-tls-cert
  rules:
    - host: bakery.yourdomain.com
    - host: bakewise.ai
      http:
        paths:
          - path: /
@@ -55,7 +53,7 @@ spec:
                name: frontend-service
                port:
                  number: 3000
          - path: /api
          - path: /api/v1
            pathType: Prefix
            backend:
              service:
@@ -63,31 +61,4 @@ spec:
                port:
                  number: 8000

    - host: api.yourdomain.com
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: gateway-service
                port:
                  number: 8000

    - host: monitoring.yourdomain.com
      http:
        paths:
          - path: /grafana
            pathType: Prefix
            backend:
              service:
                name: grafana-service
                port:
                  number: 3000
          - path: /prometheus
            pathType: Prefix
            backend:
              service:
                name: prometheus-service
                port:
                  number: 9090
# Monitoring (monitoring.bakewise.ai) is now handled by signoz-ingress.yaml in the signoz namespace
78  infrastructure/kubernetes/overlays/prod/signoz-ingress.yaml  Normal file
@@ -0,0 +1,78 @@
---
# SigNoz Ingress for Production
# SigNoz is deployed via Helm in the 'signoz' namespace
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: signoz-ingress-prod
  namespace: signoz
  labels:
    app.kubernetes.io/name: signoz
    app.kubernetes.io/component: ingress
  annotations:
    # Nginx ingress controller annotations
    nginx.ingress.kubernetes.io/ssl-redirect: "true"
    nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
    nginx.ingress.kubernetes.io/proxy-body-size: "50m"
    nginx.ingress.kubernetes.io/proxy-connect-timeout: "600"
    nginx.ingress.kubernetes.io/proxy-send-timeout: "600"
    nginx.ingress.kubernetes.io/proxy-read-timeout: "600"
    nginx.ingress.kubernetes.io/rewrite-target: /$2
    nginx.ingress.kubernetes.io/use-regex: "true"

    # CORS configuration
    nginx.ingress.kubernetes.io/enable-cors: "true"
    nginx.ingress.kubernetes.io/cors-allow-origin: "https://bakewise.ai,https://monitoring.bakewise.ai"
    nginx.ingress.kubernetes.io/cors-allow-methods: "GET, POST, PUT, DELETE, OPTIONS, PATCH"
    nginx.ingress.kubernetes.io/cors-allow-headers: "Content-Type, Authorization, X-Requested-With, Accept, Origin"
    nginx.ingress.kubernetes.io/cors-allow-credentials: "true"

    # Security headers
    nginx.ingress.kubernetes.io/configuration-snippet: |
      more_set_headers "X-Frame-Options: SAMEORIGIN";
      more_set_headers "X-Content-Type-Options: nosniff";
      more_set_headers "X-XSS-Protection: 1; mode=block";
      more_set_headers "Referrer-Policy: strict-origin-when-cross-origin";

    # Rate limiting
    nginx.ingress.kubernetes.io/limit-rps: "100"
    nginx.ingress.kubernetes.io/limit-connections: "50"

    # Cert-manager annotations for automatic certificate issuance
    cert-manager.io/cluster-issuer: "letsencrypt-production"
    cert-manager.io/acme-challenge-type: http01

spec:
  ingressClassName: nginx
  tls:
    - hosts:
        - monitoring.bakewise.ai
      secretName: signoz-prod-tls-cert
  rules:
    - host: monitoring.bakewise.ai
      http:
        paths:
          # SigNoz Frontend UI
          - path: /signoz(/|$)(.*)
            pathType: ImplementationSpecific
            backend:
              service:
                name: signoz-frontend
                port:
                  number: 3301
          # SigNoz Query Service API
          - path: /signoz-api(/|$)(.*)
            pathType: ImplementationSpecific
            backend:
              service:
                name: signoz-query-service
                port:
                  number: 8080
          # SigNoz AlertManager
          - path: /signoz-alerts(/|$)(.*)
            pathType: ImplementationSpecific
            backend:
              service:
                name: signoz-alertmanager
                port:
                  number: 9093
79  infrastructure/kubernetes/signoz-values.yaml  Normal file
@@ -0,0 +1,79 @@
# SigNoz Helm Chart Values - Customized for Bakery IA
# https://github.com/SigNoz/charts

# Global settings
global:
  storageClass: "standard"

# Frontend configuration
frontend:
  service:
    type: ClusterIP
    port: 3301
  ingress:
    enabled: true
    hosts:
      - host: localhost
        paths:
          - path: /signoz
            pathType: Prefix
    annotations:
      nginx.ingress.kubernetes.io/rewrite-target: /$2

# Query Service configuration
queryService:
  replicaCount: 1
  resources:
    requests:
      cpu: 100m
      memory: 256Mi
    limits:
      cpu: 200m
      memory: 512Mi

# AlertManager configuration
alertmanager:
  replicaCount: 1
  resources:
    requests:
      cpu: 50m
      memory: 128Mi
    limits:
      cpu: 100m
      memory: 256Mi

# ClickHouse configuration
clickhouse:
  persistence:
    enabled: true
    size: 10Gi
  resources:
    requests:
      cpu: 500m
      memory: 1Gi
    limits:
      cpu: 1000m
      memory: 2Gi

# OpenTelemetry Collector configuration
otelCollector:
  enabled: true
  config:
    exporters:
      otlp:
        endpoint: "signoz-query-service:8080"
    service:
      pipelines:
        traces:
          receivers: [otlp]
          exporters: [otlp]
        metrics:
          receivers: [otlp]
          exporters: [otlp]
        logs:
          receivers: [otlp]
          exporters: [otlp]

# Resource optimization for development
# These can be increased for production
development: true
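Before handing this file to helm, it is worth checking that it parses and that each collector pipeline only references declared exporters. A quick sketch, assuming PyYAML is installed:

    import yaml

    with open("infrastructure/kubernetes/signoz-values.yaml") as f:
        values = yaml.safe_load(f)

    config = values["otelCollector"]["config"]
    declared = set(config["exporters"])
    for name, pipeline in config["service"]["pipelines"].items():
        missing = set(pipeline["exporters"]) - declared
        assert not missing, f"pipeline {name} references undeclared exporters: {missing}"
    print("pipelines:", sorted(config["service"]["pipelines"]))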
@@ -228,6 +228,12 @@ setup() {

    if [ $? -eq 0 ]; then
        print_success "Colima started successfully"

        # Increase inotify limits for Colima to prevent "too many open files" errors
        print_status "Increasing inotify limits in Colima VM..."
        colima ssh --profile k8s-local "sudo sysctl -w fs.inotify.max_user_watches=524288"
        colima ssh --profile k8s-local "sudo sysctl -w fs.inotify.max_user_instances=512"
        print_success "Inotify limits increased"
    else
        print_error "Failed to start Colima"
        exit 1
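Once the sysctl calls have run, the new limits can be confirmed by reading the procfs knobs directly (a sketch; run it inside the Colima VM, not on the macOS host):

    from pathlib import Path

    for knob in ("max_user_watches", "max_user_instances"):
        value = Path(f"/proc/sys/fs/inotify/{knob}").read_text().strip()
        print(knob, "=", value)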
@@ -261,23 +267,23 @@ setup() {

    # 4. Connect registry to Kind network
    connect_registry_to_kind

    # 3. Install NGINX Ingress Controller

    # 5. Install NGINX Ingress Controller
    print_status "Installing NGINX Ingress Controller..."

    # Apply the ingress-nginx manifest
    kubectl apply -f https://raw.githubusercontent.com/kubernetes/ingress-nginx/main/deploy/static/provider/kind/deploy.yaml

    if [ $? -eq 0 ]; then
        print_success "NGINX Ingress Controller manifest applied"
    else
        print_error "Failed to apply NGINX Ingress Controller manifest"
        exit 1
    fi

    # Wait for ingress-nginx pods to be ready with retry logic
    wait_for_pods "ingress-nginx" "app.kubernetes.io/component=controller" 300

    if [ $? -ne 0 ]; then
        print_error "NGINX Ingress Controller failed to become ready"
        print_status "Checking pod status for debugging..."
@@ -285,30 +291,10 @@ setup() {
        kubectl describe pods -n ingress-nginx
        exit 1
    fi

    # 4. Configure permanent localhost access
    print_status "Configuring localhost access via NodePort..."

    # Check if service exists
    if kubectl get svc ingress-nginx-controller -n ingress-nginx &>/dev/null; then
        # Patch the service to expose NodePorts
        kubectl patch svc ingress-nginx-controller \
            -n ingress-nginx \
            --type merge \
            -p '{"spec":{"type":"NodePort","ports":[{"name":"http","port":80,"targetPort":"http","nodePort":30080},{"name":"https","port":443,"targetPort":"https","nodePort":30443}]}}'

        if [ $? -eq 0 ]; then
            print_success "NodePort configuration applied"
        else
            print_error "Failed to patch Ingress service"
            exit 1
        fi
    else
        print_error "Ingress NGINX controller service not found"
        exit 1
    fi

    # 5. Verify port mappings from kind-config.yaml

    print_success "NGINX Ingress Controller ready (using Kind's built-in NodePort configuration)"

    # 6. Verify port mappings from kind-config.yaml
    print_status "Verifying port mappings from configuration..."

    # Extract ports from kind-config.yaml
@@ -323,24 +309,24 @@ setup() {
    echo "  - Colima profile: k8s-local"
    echo "  - Kind cluster: $CLUSTER_NAME"
    echo "  - Local registry: localhost:5001"
    echo "  - Direct port mappings (from kind-config.yaml):"
    echo "      Frontend: localhost:3000 -> container:30300"
    echo "      Gateway: localhost:8000 -> container:30800"
    echo "  - Ingress access:"
    echo "      HTTP: localhost:${HTTP_HOST_PORT} -> ingress:30080"
    echo "      HTTPS: localhost:${HTTPS_HOST_PORT} -> ingress:30443"
    echo "  - NodePort access:"
    echo "      HTTP: localhost:30080"
    echo "      HTTPS: localhost:30443"
    echo "----------------------------------------"
    print_status "To access your applications:"
    echo "  - Use Ingress via: http://localhost:${HTTP_HOST_PORT}"
    echo "  - Direct NodePort: http://localhost:30080"
    echo ""
    print_status "Port Mappings (configured in kind-config.yaml):"
    echo "  - HTTP Ingress: localhost:${HTTP_HOST_PORT} -> Kind NodePort 30080"
    echo "  - HTTPS Ingress: localhost:${HTTPS_HOST_PORT} -> Kind NodePort 30443"
    echo "  - Frontend Direct: localhost:3000 -> container:30300"
    echo "  - Gateway Direct: localhost:8000 -> container:30800"
    echo ""
    print_status "How to access your application:"
    echo "  1. Start Tilt: tilt up"
    echo "  2. Access via:"
    echo "     - Ingress: http://localhost (or https://localhost)"
    echo "     - Direct: http://localhost:3000 (frontend), http://localhost:8000 (gateway)"
    echo "     - Tilt UI: http://localhost:10350"
    echo "----------------------------------------"
    print_status "Local Registry Information:"
    echo "  - Registry URL: localhost:5001"
    echo "  - Images will be pushed to: localhost:5001/bakery/<service>"
    echo "  - Update your Tiltfile with: default_registry('localhost:5001')"
    echo "  - Images pushed to: localhost:5001/bakery/<service>"
    echo "  - Tiltfile already configured: default_registry('localhost:5001')"
    echo "----------------------------------------"
}
@@ -1,22 +1,50 @@
"""Main FastAPI application for AI Insights Service."""

from fastapi import FastAPI
from fastapi import FastAPI, Response
from fastapi.middleware.cors import CORSMiddleware
from contextlib import asynccontextmanager
import structlog
import os

from app.core.config import settings
from app.core.database import init_db, close_db
from app.api import insights
from shared.monitoring.logging import setup_logging
from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware

# Configure structured logging
structlog.configure(
    processors=[
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.JSONRenderer()
    ]
)
# OpenTelemetry imports
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
from opentelemetry.instrumentation.redis import RedisInstrumentor
from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
from opentelemetry.sdk.resources import Resource

# Configure OpenTelemetry tracing
def setup_tracing(service_name: str = "ai-insights"):
    """Initialize OpenTelemetry tracing with an OTLP exporter (SigNoz collector)."""
    resource = Resource.create({"service.name": service_name})

    otlp_exporter = OTLPSpanExporter(
        endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://otel-collector.monitoring.svc.cluster.local:4317"),
        insecure=True
    )

    provider = TracerProvider(resource=resource)
    processor = BatchSpanProcessor(otlp_exporter)
    provider.add_span_processor(processor)
    trace.set_tracer_provider(provider)

    return provider

# Initialize tracing
tracer_provider = setup_tracing("ai-insights")

# Setup logging
setup_logging("ai-insights", getattr(settings, 'LOG_LEVEL', 'INFO'))
logger = structlog.get_logger()
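With the provider registered globally, any module in the service can start spans through the standard OpenTelemetry API. A minimal usage sketch (hypothetical handler code, not part of this commit):

    from opentelemetry import trace

    tracer = trace.get_tracer(__name__)

    def score_insight(insight_id: str) -> None:
        # Spans created here are batched and shipped to the OTLP endpoint above.
        with tracer.start_as_current_span("score_insight") as span:
            span.set_attribute("insight.id", insight_id)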
@@ -28,6 +56,10 @@ async def lifespan(app: FastAPI):
    await init_db()
    logger.info("Database initialized")

    # Start metrics server
    metrics_collector.start_metrics_server(8080)
    logger.info("Metrics server started on port 8080")

    yield

    # Shutdown
@@ -44,6 +76,24 @@ app = FastAPI(
    lifespan=lifespan
)

# Instrument FastAPI with OpenTelemetry
FastAPIInstrumentor.instrument_app(app)

# Instrument httpx for outgoing requests
HTTPXClientInstrumentor().instrument()

# Instrument Redis
RedisInstrumentor().instrument()

# Instrument SQLAlchemy
SQLAlchemyInstrumentor().instrument()

# Initialize metrics collector
metrics_collector = MetricsCollector("ai-insights")

# Add metrics middleware to track HTTP requests
add_metrics_middleware(app, metrics_collector)

# CORS middleware
app.add_middleware(
    CORSMiddleware,
@@ -81,6 +131,15 @@ async def health_check():
    }


@app.get("/metrics")
async def metrics():
    """Prometheus metrics endpoint"""
    return Response(
        content=metrics_collector.get_metrics(),
        media_type="text/plain; version=0.0.4; charset=utf-8"
    )


if __name__ == "__main__":
    import uvicorn
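The endpoint serves the Prometheus text exposition format, so it can be checked with any HTTP client. A sketch, assuming the service is reachable on localhost:8000:

    from urllib.request import urlopen

    with urlopen("http://localhost:8000/metrics") as resp:
        body = resp.read().decode()
    # Exposition format: "# HELP ..." / "# TYPE ..." headers followed by samples.
    print([line for line in body.splitlines() if line.startswith("# TYPE")][:5])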
@@ -29,6 +29,16 @@ pytz==2023.3
# Logging
structlog==23.2.0

# Monitoring and Observability
prometheus-client==0.23.1
opentelemetry-api==1.27.0
opentelemetry-sdk==1.27.0
opentelemetry-instrumentation-fastapi==0.48b0
opentelemetry-exporter-otlp-proto-grpc==1.27.0
opentelemetry-instrumentation-httpx==0.48b0
opentelemetry-instrumentation-redis==0.48b0
opentelemetry-instrumentation-sqlalchemy==0.48b0

# Machine Learning (for confidence scoring and impact estimation)
numpy==1.26.2
pandas==2.1.3
@@ -4,25 +4,52 @@ Alert Processor Service v2.0
Main FastAPI application with RabbitMQ consumer lifecycle management.
"""

from fastapi import FastAPI
from fastapi import FastAPI, Response
from fastapi.middleware.cors import CORSMiddleware
from contextlib import asynccontextmanager
import structlog
import os

from app.core.config import settings
from app.consumer.event_consumer import EventConsumer
from app.api import alerts, sse
from shared.redis_utils import initialize_redis, close_redis
from shared.monitoring.logging import setup_logging
from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware

# Configure structured logging
structlog.configure(
    processors=[
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.add_log_level,
        structlog.processors.JSONRenderer()
    ]
)
# OpenTelemetry imports
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
from opentelemetry.instrumentation.redis import RedisInstrumentor
from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
from opentelemetry.sdk.resources import Resource

# Configure OpenTelemetry tracing
def setup_tracing(service_name: str = "alert-processor"):
    """Initialize OpenTelemetry tracing with an OTLP exporter (SigNoz collector)."""
    resource = Resource.create({"service.name": service_name})

    otlp_exporter = OTLPSpanExporter(
        endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://otel-collector.monitoring.svc.cluster.local:4317"),
        insecure=True
    )

    provider = TracerProvider(resource=resource)
    processor = BatchSpanProcessor(otlp_exporter)
    provider.add_span_processor(processor)
    trace.set_tracer_provider(provider)

    return provider

# Initialize tracing
tracer_provider = setup_tracing("alert-processor")

# Setup logging
setup_logging("alert-processor", getattr(settings, 'LOG_LEVEL', 'INFO'))
logger = structlog.get_logger()

# Global consumer instance
@@ -54,6 +81,10 @@ async def lifespan(app: FastAPI):
        consumer = EventConsumer()
        await consumer.start()
        logger.info("alert_processor_started")

        # Start metrics server
        metrics_collector.start_metrics_server(8080)
        logger.info("Metrics server started on port 8080")
    except Exception as e:
        logger.error("alert_processor_startup_failed", error=str(e))
        raise
@@ -79,6 +110,24 @@ app = FastAPI(
    debug=settings.DEBUG
)

# Instrument FastAPI with OpenTelemetry
FastAPIInstrumentor.instrument_app(app)

# Instrument httpx for outgoing requests
HTTPXClientInstrumentor().instrument()

# Instrument Redis
RedisInstrumentor().instrument()

# Instrument SQLAlchemy
SQLAlchemyInstrumentor().instrument()

# Initialize metrics collector
metrics_collector = MetricsCollector("alert-processor")

# Add metrics middleware to track HTTP requests
add_metrics_middleware(app, metrics_collector)

# CORS middleware
app.add_middleware(
    CORSMiddleware,
@@ -126,6 +175,15 @@ async def root():
    }


@app.get("/metrics")
async def metrics():
    """Prometheus metrics endpoint"""
    return Response(
        content=metrics_collector.get_metrics(),
        media_type="text/plain; version=0.0.4; charset=utf-8"
    )


if __name__ == "__main__":
    import uvicorn
@@ -32,3 +32,13 @@ python-dateutil==2.8.2

# Authentication
python-jose[cryptography]==3.3.0

# Monitoring and Observability
prometheus-client==0.23.1
opentelemetry-api==1.27.0
opentelemetry-sdk==1.27.0
opentelemetry-instrumentation-fastapi==0.48b0
opentelemetry-exporter-otlp-proto-grpc==1.27.0
opentelemetry-instrumentation-httpx==0.48b0
opentelemetry-instrumentation-redis==0.48b0
opentelemetry-instrumentation-sqlalchemy==0.48b0
@@ -3,16 +3,51 @@ Demo Session Service - Main Application
Manages isolated demo sessions with ephemeral data
"""

from fastapi import FastAPI, Request
from fastapi import FastAPI, Request, Response
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
import structlog
from contextlib import asynccontextmanager
import os

from app.core import settings, DatabaseManager
from app.api import demo_sessions, demo_accounts, demo_operations, internal
from shared.redis_utils import initialize_redis, close_redis
from shared.monitoring.logging import setup_logging
from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware

# OpenTelemetry imports
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
from opentelemetry.instrumentation.redis import RedisInstrumentor
from opentelemetry.sdk.resources import Resource

# Configure OpenTelemetry tracing
def setup_tracing(service_name: str = "demo-session"):
    """Initialize OpenTelemetry tracing with an OTLP exporter (SigNoz collector)."""
    resource = Resource.create({"service.name": service_name})

    otlp_exporter = OTLPSpanExporter(
        endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://otel-collector.monitoring.svc.cluster.local:4317"),
        insecure=True
    )

    provider = TracerProvider(resource=resource)
    processor = BatchSpanProcessor(otlp_exporter)
    provider.add_span_processor(processor)
    trace.set_tracer_provider(provider)

    return provider

# Initialize tracing
tracer_provider = setup_tracing("demo-session")

# Setup logging
setup_logging("demo-session", getattr(settings, 'LOG_LEVEL', 'INFO'))
logger = structlog.get_logger()

# Initialize database
||||
# Initialize database
|
||||
@@ -34,6 +69,10 @@ async def lifespan(app: FastAPI):
|
||||
max_connections=50
|
||||
)
|
||||
|
||||
# Start metrics server
|
||||
metrics_collector.start_metrics_server(8080)
|
||||
logger.info("Metrics server started on port 8080")
|
||||
|
||||
logger.info("Demo Session Service started successfully")
|
||||
|
||||
yield
|
||||
@@ -52,6 +91,21 @@ app = FastAPI(
    lifespan=lifespan
)

# Instrument FastAPI with OpenTelemetry
FastAPIInstrumentor.instrument_app(app)

# Instrument httpx for outgoing requests
HTTPXClientInstrumentor().instrument()

# Instrument Redis
RedisInstrumentor().instrument()

# Initialize metrics collector
metrics_collector = MetricsCollector("demo-session")

# Add metrics middleware to track HTTP requests
add_metrics_middleware(app, metrics_collector)

# CORS middleware
app.add_middleware(
    CORSMiddleware,
@@ -110,6 +164,15 @@ async def health():
    }


@app.get("/metrics")
async def metrics():
    """Prometheus metrics endpoint"""
    return Response(
        content=metrics_collector.get_metrics(),
        media_type="text/plain; version=0.0.4; charset=utf-8"
    )


if __name__ == "__main__":
    import uvicorn
    uvicorn.run(
@@ -18,3 +18,11 @@ prometheus-client==0.23.1
aio-pika==9.4.3
email-validator==2.2.0
pytz==2024.2

# OpenTelemetry for distributed tracing
opentelemetry-api==1.27.0
opentelemetry-sdk==1.27.0
opentelemetry-instrumentation-fastapi==0.48b0
opentelemetry-exporter-otlp-proto-grpc==1.27.0
opentelemetry-instrumentation-httpx==0.48b0
opentelemetry-instrumentation-redis==0.48b0