Update monitoring packages to latest versions

- Updated all OpenTelemetry packages to latest versions:
  - opentelemetry-api: 1.27.0 → 1.39.1
  - opentelemetry-sdk: 1.27.0 → 1.39.1
  - opentelemetry-exporter-otlp-proto-grpc: 1.27.0 → 1.39.1
  - opentelemetry-exporter-otlp-proto-http: 1.27.0 → 1.39.1
  - opentelemetry-instrumentation-fastapi: 0.48b0 → 0.60b1
  - opentelemetry-instrumentation-httpx: 0.48b0 → 0.60b1
  - opentelemetry-instrumentation-redis: 0.48b0 → 0.60b1
  - opentelemetry-instrumentation-sqlalchemy: 0.48b0 → 0.60b1

- Removed prometheus-client==0.23.1 from all services
- Unified all services to use the same monitoring package versions
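
For reference, the resulting unified pins (a sketch assuming pip-style `requirements.txt` entries; the actual dependency files may differ):

```
opentelemetry-api==1.39.1
opentelemetry-sdk==1.39.1
opentelemetry-exporter-otlp-proto-grpc==1.39.1
opentelemetry-exporter-otlp-proto-http==1.39.1
opentelemetry-instrumentation-fastapi==0.60b1
opentelemetry-instrumentation-httpx==0.60b1
opentelemetry-instrumentation-redis==0.60b1
opentelemetry-instrumentation-sqlalchemy==0.60b1
```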

Generated by Mistral Vibe.
Co-Authored-By: Mistral Vibe <vibe@mistral.ai>
Author: Urtzi Alfaro
Date: 2026-01-08 19:25:52 +01:00
Parent: dfb7e4b237
Commit: 29d19087f1
129 changed files with 5718 additions and 1821 deletions

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: ai-insights-service
app.kubernetes.io/component: microservice
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
# Wait for Redis to be ready
- name: wait-for-redis
@@ -92,6 +94,26 @@ spec:
ports:
- containerPort: 8000
name: http
env:
# OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME
value: "ai-insights-service"
- name: ENABLE_TRACING
value: "true"
# Logging Configuration
- name: OTEL_LOGS_EXPORTER
value: "otlp"
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
value: "true"
# Metrics Configuration
- name: ENABLE_OTEL_METRICS
value: "true"
- name: ENABLE_SYSTEM_METRICS
value: "true"
envFrom:
- configMapRef:
name: bakery-config

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: auth-service
app.kubernetes.io/component: microservice
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
# Wait for Redis to be ready
- name: wait-for-redis
@@ -93,6 +95,21 @@ spec:
ports:
- containerPort: 8000
name: http
env:
# OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME
value: "auth-service"
- name: ENABLE_TRACING
value: "true"
# Logging Configuration
- name: OTEL_LOGS_EXPORTER
value: "otlp"
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
value: "true"
envFrom:
- configMapRef:
name: bakery-config

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: ai-insights-db
app.kubernetes.io/component: database
spec:
imagePullSecrets:
- name: dockerhub-creds
securityContext:
fsGroup: 70
initContainers:

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: alert-processor-db
app.kubernetes.io/component: database
spec:
imagePullSecrets:
- name: dockerhub-creds
securityContext:
fsGroup: 70
initContainers:

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: auth-db
app.kubernetes.io/component: database
spec:
imagePullSecrets:
- name: dockerhub-creds
securityContext:
fsGroup: 70
initContainers:

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: distribution-db
app.kubernetes.io/component: database
spec:
imagePullSecrets:
- name: dockerhub-creds
securityContext:
fsGroup: 70
initContainers:

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: external-db
app.kubernetes.io/component: database
spec:
imagePullSecrets:
- name: dockerhub-creds
securityContext:
fsGroup: 70
initContainers:

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: forecasting-db
app.kubernetes.io/component: database
spec:
imagePullSecrets:
- name: dockerhub-creds
securityContext:
fsGroup: 70
initContainers:

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: inventory-db
app.kubernetes.io/component: database
spec:
imagePullSecrets:
- name: dockerhub-creds
securityContext:
fsGroup: 70
initContainers:

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: notification-db
app.kubernetes.io/component: database
spec:
imagePullSecrets:
- name: dockerhub-creds
securityContext:
fsGroup: 70
initContainers:

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: orchestrator-db
app.kubernetes.io/component: database
spec:
imagePullSecrets:
- name: dockerhub-creds
securityContext:
fsGroup: 70
initContainers:

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: orders-db
app.kubernetes.io/component: database
spec:
imagePullSecrets:
- name: dockerhub-creds
securityContext:
fsGroup: 70
initContainers:

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: pos-db
app.kubernetes.io/component: database
spec:
imagePullSecrets:
- name: dockerhub-creds
securityContext:
fsGroup: 70
initContainers:

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: {{SERVICE_NAME}}-db
app.kubernetes.io/component: database
spec:
imagePullSecrets:
- name: dockerhub-creds
containers:
- name: postgres
image: postgres:17-alpine
@@ -121,4 +123,4 @@ spec:
- ReadWriteOnce
resources:
requests:
- storage: 1Gi
\ No newline at end of file
+ storage: 1Gi

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: procurement-db
app.kubernetes.io/component: database
spec:
imagePullSecrets:
- name: dockerhub-creds
securityContext:
fsGroup: 70
initContainers:

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: production-db
app.kubernetes.io/component: database
spec:
imagePullSecrets:
- name: dockerhub-creds
securityContext:
fsGroup: 70
initContainers:

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: rabbitmq
app.kubernetes.io/component: message-broker
spec:
imagePullSecrets:
- name: dockerhub-creds
containers:
- name: rabbitmq
image: rabbitmq:4.1-management-alpine
@@ -120,4 +122,4 @@ spec:
- ReadWriteOnce
resources:
requests:
- storage: 2Gi
\ No newline at end of file
+ storage: 2Gi

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: recipes-db
app.kubernetes.io/component: database
spec:
imagePullSecrets:
- name: dockerhub-creds
securityContext:
fsGroup: 70
initContainers:

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: redis
app.kubernetes.io/component: cache
spec:
imagePullSecrets:
- name: dockerhub-creds
securityContext:
fsGroup: 999 # redis group
initContainers:
@@ -166,4 +168,4 @@ spec:
- ReadWriteOnce
resources:
requests:
- storage: 1Gi
\ No newline at end of file
+ storage: 1Gi

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: sales-db
app.kubernetes.io/component: database
spec:
imagePullSecrets:
- name: dockerhub-creds
securityContext:
fsGroup: 70
initContainers:

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: suppliers-db
app.kubernetes.io/component: database
spec:
imagePullSecrets:
- name: dockerhub-creds
securityContext:
fsGroup: 70
initContainers:

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: tenant-db
app.kubernetes.io/component: database
spec:
imagePullSecrets:
- name: dockerhub-creds
securityContext:
fsGroup: 70
initContainers:

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: training-db
app.kubernetes.io/component: database
spec:
imagePullSecrets:
- name: dockerhub-creds
securityContext:
fsGroup: 70
initContainers:

View File

@@ -16,6 +16,8 @@ spec:
app: distribution-service
tier: backend
spec:
imagePullSecrets:
- name: dockerhub-creds
containers:
- name: distribution-service
image: bakery/distribution-service:latest
@@ -58,6 +60,25 @@ spec:
value: "30"
- name: HTTP_RETRIES
value: "3"
# OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME
value: "distribution-service"
- name: ENABLE_TRACING
value: "true"
# Logging Configuration
- name: OTEL_LOGS_EXPORTER
value: "otlp"
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
value: "true"
# Metrics Configuration
- name: ENABLE_OTEL_METRICS
value: "true"
- name: ENABLE_SYSTEM_METRICS
value: "true"
livenessProbe:
httpGet:
path: /health
@@ -107,4 +128,4 @@ spec:
port: 8000
targetPort: 8000
name: http
- type: ClusterIP
\ No newline at end of file
+ type: ClusterIP

View File

@@ -23,6 +23,8 @@ spec:
app.kubernetes.io/component: microservice
version: "2.0"
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
# Wait for Redis to be ready
- name: wait-for-redis
@@ -85,6 +87,26 @@ spec:
ports:
- containerPort: 8000
name: http
env:
# OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME
value: "external-service"
- name: ENABLE_TRACING
value: "true"
# Logging Configuration
- name: OTEL_LOGS_EXPORTER
value: "otlp"
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
value: "true"
# Metrics Configuration
- name: ENABLE_OTEL_METRICS
value: "true"
- name: ENABLE_SYSTEM_METRICS
value: "true"
envFrom:
- configMapRef:
name: bakery-config

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: forecasting-service
app.kubernetes.io/component: microservice
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
# Wait for Redis to be ready
- name: wait-for-redis
@@ -92,6 +94,26 @@ spec:
ports:
- containerPort: 8000
name: http
env:
# OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME
value: "forecasting-service"
- name: ENABLE_TRACING
value: "true"
# Logging Configuration
- name: OTEL_LOGS_EXPORTER
value: "otlp"
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
value: "true"
# Metrics Configuration
- name: ENABLE_OTEL_METRICS
value: "true"
- name: ENABLE_SYSTEM_METRICS
value: "true"
envFrom:
- configMapRef:
name: bakery-config

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: frontend
app.kubernetes.io/component: frontend
spec:
imagePullSecrets:
- name: dockerhub-creds
containers:
- name: frontend
image: bakery/dashboard:latest

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: gateway
app.kubernetes.io/component: gateway
spec:
imagePullSecrets:
- name: dockerhub-creds
containers:
- name: gateway
image: bakery/gateway:latest

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: inventory-service
app.kubernetes.io/component: microservice
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
# Wait for Redis to be ready
- name: wait-for-redis
@@ -92,6 +94,26 @@ spec:
ports:
- containerPort: 8000
name: http
env:
# OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME
value: "inventory-service"
- name: ENABLE_TRACING
value: "true"
# Logging Configuration
- name: OTEL_LOGS_EXPORTER
value: "otlp"
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
value: "true"
# Metrics Configuration
- name: ENABLE_OTEL_METRICS
value: "true"
- name: ENABLE_SYSTEM_METRICS
value: "true"
envFrom:
- configMapRef:
name: bakery-config

View File

@@ -1,501 +0,0 @@
# Bakery IA - Production Monitoring Stack
This directory contains the complete production-ready monitoring infrastructure for the Bakery IA platform.
## 📊 Components
### Core Monitoring
- **Prometheus v3.0.1** - Time-series metrics database (2 replicas with HA)
- **Grafana v12.3.0** - Visualization and dashboarding
- **AlertManager v0.27.0** - Alert routing and notification (3 replicas with HA)
### Distributed Tracing
- **Jaeger v1.51** - Distributed tracing with persistent storage
### Exporters
- **PostgreSQL Exporter v0.15.0** - Database metrics and health
- **Node Exporter v1.7.0** - Infrastructure and OS-level metrics (DaemonSet)
## 🚀 Deployment
### Prerequisites
1. Kubernetes cluster (v1.24+)
2. kubectl configured
3. kustomize (v4.0+) or kubectl with kustomize support
4. Storage class available for PersistentVolumeClaims
### Production Deployment
```bash
# 1. Update secrets with production values
kubectl create secret generic grafana-admin \
--from-literal=admin-user=admin \
--from-literal=admin-password=$(openssl rand -base64 32) \
--namespace monitoring --dry-run=client -o yaml > secrets.yaml
# 2. Update AlertManager SMTP credentials
kubectl create secret generic alertmanager-secrets \
--from-literal=smtp-host="smtp.gmail.com:587" \
--from-literal=smtp-username="alerts@yourdomain.com" \
--from-literal=smtp-password="YOUR_SMTP_PASSWORD" \
--from-literal=smtp-from="alerts@yourdomain.com" \
--from-literal=slack-webhook-url="https://hooks.slack.com/services/YOUR/WEBHOOK/URL" \
--namespace monitoring --dry-run=client -o yaml >> secrets.yaml
# 3. Update PostgreSQL exporter connection string
kubectl create secret generic postgres-exporter \
--from-literal=data-source-name="postgresql://user:password@postgres.bakery-ia:5432/bakery?sslmode=require" \
--namespace monitoring --dry-run=client -o yaml >> secrets.yaml
# 4. Deploy monitoring stack
kubectl apply -k infrastructure/kubernetes/overlays/prod
# 5. Verify deployment
kubectl get pods -n monitoring
kubectl get pvc -n monitoring
```
### Local Development Deployment
For local Kind clusters, monitoring is disabled by default to save resources. To enable:
```bash
# Uncomment monitoring in overlays/dev/kustomization.yaml
# Then apply:
kubectl apply -k infrastructure/kubernetes/overlays/dev
```
## 🔐 Security Configuration
### Important Security Notes
⚠️ **NEVER commit real secrets to Git!**
The `secrets.yaml` file contains placeholder values. In production, use one of:
1. **Sealed Secrets** (Recommended)
```bash
kubectl apply -f https://github.com/bitnami-labs/sealed-secrets/releases/download/v0.24.0/controller.yaml
kubeseal --format=yaml < secrets.yaml > sealed-secrets.yaml
```
2. **External Secrets Operator**
```bash
helm install external-secrets external-secrets/external-secrets -n external-secrets
```
3. **Cloud Provider Secrets**
- AWS Secrets Manager
- GCP Secret Manager
- Azure Key Vault
### Grafana Admin Password
Change the default password immediately:
```bash
# Generate strong password
NEW_PASSWORD=$(openssl rand -base64 32)
# Update secret
kubectl patch secret grafana-admin -n monitoring \
-p="{\"data\":{\"admin-password\":\"$(echo -n $NEW_PASSWORD | base64)\"}}"
# Restart Grafana
kubectl rollout restart deployment grafana -n monitoring
```
## 📈 Accessing Monitoring Services
### Via Ingress (Production)
```
https://monitoring.yourdomain.com/grafana
https://monitoring.yourdomain.com/prometheus
https://monitoring.yourdomain.com/alertmanager
https://monitoring.yourdomain.com/jaeger
```
### Via Port Forwarding (Development)
```bash
# Grafana
kubectl port-forward -n monitoring svc/grafana 3000:3000
# Prometheus
kubectl port-forward -n monitoring svc/prometheus-external 9090:9090
# AlertManager
kubectl port-forward -n monitoring svc/alertmanager-external 9093:9093
# Jaeger
kubectl port-forward -n monitoring svc/jaeger-query 16686:16686
```
Then access:
- Grafana: http://localhost:3000
- Prometheus: http://localhost:9090
- AlertManager: http://localhost:9093
- Jaeger: http://localhost:16686
## 📊 Grafana Dashboards
### Pre-configured Dashboards
1. **Gateway Metrics** - API gateway performance
- Request rate by endpoint
- P95 latency
- Error rates
- Authentication metrics
2. **Services Overview** - Microservices health
- Request rate by service
- P99 latency
- Error rates by service
- Service health status
3. **Circuit Breakers** - Resilience patterns
- Circuit breaker states
- Trip rates
- Rejected requests
4. **PostgreSQL Monitoring** - Database health
- Connections, transactions, cache hit ratio
- Slow queries, locks, replication lag
5. **Node Metrics** - Infrastructure monitoring
- CPU, memory, disk, network per node
6. **AlertManager** - Alert management
- Active alerts, firing rate, notifications
7. **Business Metrics** - KPIs
- Service performance, tenant activity, ML metrics
### Creating Custom Dashboards
1. Log in to Grafana (admin/[your-password])
2. Click "+ → Dashboard"
3. Add panels with Prometheus queries
4. Save the dashboard
5. Export the JSON and add it to `grafana-dashboards.yaml` (see the sketch below)
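A minimal sketch of the ConfigMap wrapper that `grafana-dashboards.yaml` implies (the name and the sidecar label are assumptions):
```yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboards        # hypothetical name
  namespace: monitoring
  labels:
    grafana_dashboard: "1"        # label watched by the provisioning sidecar (assumed)
data:
  my-dashboard.json: |
    { "title": "My Dashboard", "panels": [] }
```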
## 🚨 Alert Configuration
### Alert Rules
Alert rules are defined in `alert-rules.yaml` and organized by category (a sketch of one group follows the list):
- **bakery_services** - Service health, errors, latency, memory
- **bakery_business** - Training jobs, ML accuracy, API limits
- **alert_system_health** - Alert system components, RabbitMQ, Redis
- **alert_system_performance** - Processing errors, delivery failures
- **alert_system_business** - Alert volume, response times
- **alert_system_capacity** - Queue sizes, storage performance
- **alert_system_critical** - System failures, data loss
- **monitoring_health** - Prometheus, AlertManager self-monitoring
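A sketch of a single rule in the `bakery_services` group (the alert name, expression, and threshold are illustrative, built from the HTTP metrics listed under Metrics Reference below):
```yaml
groups:
  - name: bakery_services
    rules:
      - alert: HighErrorRate      # hypothetical rule
        expr: |
          rate(http_requests_total{status_code=~"5.."}[5m])
            / rate(http_requests_total[5m]) > 0.05
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "5xx error rate above 5% on {{ $labels.service }}"
```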
### Alert Routing
Alerts are routed based on the following labels (a minimal route sketch follows the list):
- **Severity** (critical, warning, info)
- **Component** (alert-system, database, infrastructure)
- **Service** name
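A minimal route tree illustrating that matching (receiver names are hypothetical; the labels come from the list above):
```yaml
route:
  receiver: default-email          # hypothetical receiver
  group_by: ['alertname', 'service']
  routes:
    - match:
        severity: critical
      receiver: critical-email     # hypothetical receiver
    - match:
        component: alert-system
      receiver: oncall-email       # hypothetical receiver
```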
### Notification Channels
Configure in `alertmanager.yaml`:
1. **Email** (default)
- critical-alerts@yourdomain.com
- oncall@yourdomain.com
2. **Slack** (optional, commented out)
- Update slack-webhook-url in secrets
- Uncomment slack_configs in alertmanager.yaml
3. **PagerDuty** (add if needed)
```yaml
pagerduty_configs:
- routing_key: YOUR_ROUTING_KEY
severity: '{{ .Labels.severity }}'
```
### Testing Alerts
```bash
# Create a test workload (adjust it to violate one of your alert rules)
kubectl run test-alert --image=busybox -n bakery-ia --restart=Never -- sleep 3600
# Check alert in Prometheus
# Navigate to http://localhost:9090/alerts
# Check AlertManager
# Navigate to http://localhost:9093
```
## 🔍 Troubleshooting
### Prometheus Issues
```bash
# Check Prometheus logs
kubectl logs -n monitoring prometheus-0 -f
# Check Prometheus targets
kubectl port-forward -n monitoring svc/prometheus-external 9090:9090
# Visit http://localhost:9090/targets
# Check Prometheus configuration
kubectl get configmap prometheus-config -n monitoring -o yaml
```
### AlertManager Issues
```bash
# Check AlertManager logs
kubectl logs -n monitoring alertmanager-0 -f
# Check AlertManager configuration
kubectl exec -n monitoring alertmanager-0 -- cat /etc/alertmanager/alertmanager.yml
# Test SMTP reachability (wget cannot speak SMTP; probe the port from a throwaway pod)
kubectl run smtp-test --rm -it --image=busybox --restart=Never -- \
nc -z -w 5 smtp.gmail.com 587
```
### Grafana Issues
```bash
# Check Grafana logs
kubectl logs -n monitoring deployment/grafana -f
# Reset Grafana admin password
kubectl exec -n monitoring deployment/grafana -- \
grafana-cli admin reset-admin-password NEW_PASSWORD
```
### PostgreSQL Exporter Issues
```bash
# Check exporter logs
kubectl logs -n monitoring deployment/postgres-exporter -f
# Test database connection
kubectl exec -n monitoring deployment/postgres-exporter -- \
wget -O- http://localhost:9187/metrics | grep pg_up
```
### Node Exporter Issues
```bash
# Find the node-exporter pod on a specific node (kubectl logs cannot combine
# a resource name with --selector), then tail its logs
kubectl get pods -n monitoring -l app=node-exporter \
--field-selector spec.nodeName=NODE_NAME
kubectl logs -n monitoring POD_NAME -f
# Check metrics endpoint
kubectl exec -n monitoring daemonset/node-exporter -- \
wget -O- http://localhost:9100/metrics | head -n 20
```
## 📏 Resource Requirements
### Minimum Requirements (Development)
- CPU: 2 cores
- Memory: 4Gi
- Storage: 30Gi
### Recommended Requirements (Production)
- CPU: 6-8 cores
- Memory: 16Gi
- Storage: 100Gi
### Component Resource Allocation
| Component | Replicas | CPU Request | Memory Request | CPU Limit | Memory Limit |
|-----------|----------|-------------|----------------|-----------|--------------|
| Prometheus | 2 | 500m | 1Gi | 1 | 2Gi |
| AlertManager | 3 | 100m | 128Mi | 500m | 256Mi |
| Grafana | 1 | 100m | 256Mi | 500m | 512Mi |
| Postgres Exporter | 1 | 50m | 64Mi | 200m | 128Mi |
| Node Exporter | 1/node | 50m | 64Mi | 200m | 128Mi |
| Jaeger | 1 | 250m | 512Mi | 500m | 1Gi |
## 🔄 High Availability
### Prometheus HA
- 2 replicas in StatefulSet
- Each has independent storage (volumeClaimTemplates)
- Anti-affinity to spread across nodes
- Both scrape the same targets independently
- Use Thanos for long-term storage and global query view (future enhancement)
### AlertManager HA
- 3 replicas in StatefulSet
- Clustered mode (gossip protocol)
- Automatic leader election
- Alert deduplication across instances
- Anti-affinity to spread across nodes (see the sketch below)
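The anti-affinity both sections rely on is typically expressed like this (a sketch; the `app` label key is an assumption):
```yaml
affinity:
  podAntiAffinity:
    requiredDuringSchedulingIgnoredDuringExecution:
      - labelSelector:
          matchLabels:
            app: alertmanager      # assumed pod label
        topologyKey: kubernetes.io/hostname
```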
### PodDisruptionBudgets
Ensure minimum availability during:
- Node maintenance
- Cluster upgrades
- Rolling updates
```yaml
Prometheus: minAvailable=1 (out of 2)
AlertManager: minAvailable=2 (out of 3)
Grafana: minAvailable=1 (out of 1)
```
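A sketch of the corresponding manifest for the Prometheus budget (the label selector is an assumption):
```yaml
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  name: prometheus
  namespace: monitoring
spec:
  minAvailable: 1
  selector:
    matchLabels:
      app: prometheus              # assumed pod label
```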
## 📊 Metrics Reference
### Application Metrics (from services)
```promql
# HTTP request rate
rate(http_requests_total[5m])
# HTTP error rate
rate(http_requests_total{status_code=~"5.."}[5m]) / rate(http_requests_total[5m])
# Request latency (P95)
histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))
# Active connections
active_connections
```
### PostgreSQL Metrics
```promql
# Active connections
pg_stat_database_numbackends
# Transaction rate
rate(pg_stat_database_xact_commit[5m])
# Cache hit ratio
rate(pg_stat_database_blks_hit[5m]) /
(rate(pg_stat_database_blks_hit[5m]) + rate(pg_stat_database_blks_read[5m]))
# Replication lag
pg_replication_lag_seconds
```
### Node Metrics
```promql
# CPU usage
100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
# Memory usage
(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100
# Disk I/O
rate(node_disk_read_bytes_total[5m])
rate(node_disk_written_bytes_total[5m])
# Network traffic
rate(node_network_receive_bytes_total[5m])
rate(node_network_transmit_bytes_total[5m])
```
## 🔗 Distributed Tracing
### Jaeger Configuration
Services automatically send traces when `JAEGER_ENABLED=true`:
```yaml
# In prod-configmap.yaml
JAEGER_ENABLED: "true"
JAEGER_AGENT_HOST: "jaeger-agent.monitoring.svc.cluster.local"
JAEGER_AGENT_PORT: "6831"
```
### Viewing Traces
1. Access Jaeger UI: https://monitoring.yourdomain.com/jaeger
2. Select service from dropdown
3. Click "Find Traces"
4. Explore trace details, spans, and timing
### Trace Sampling
Current sampling: 100% (all traces collected)
For high-traffic production:
```yaml
# Adjust in shared/monitoring/tracing.py
JAEGER_SAMPLE_RATE: "0.1" # 10% of traces
```
## 📚 Additional Resources
- [Prometheus Documentation](https://prometheus.io/docs/)
- [Grafana Documentation](https://grafana.com/docs/)
- [AlertManager Documentation](https://prometheus.io/docs/alerting/latest/alertmanager/)
- [Jaeger Documentation](https://www.jaegertracing.io/docs/)
- [PostgreSQL Exporter](https://github.com/prometheus-community/postgres_exporter)
- [Node Exporter](https://github.com/prometheus/node_exporter)
## 🆘 Support
For monitoring issues:
1. Check component logs (see Troubleshooting section)
2. Verify Prometheus targets are UP
3. Check AlertManager configuration and routing
4. Review resource usage and quotas
5. Contact platform team: platform-team@yourdomain.com
## 🔄 Maintenance
### Regular Tasks
**Daily:**
- Review critical alerts
- Check service health dashboards
**Weekly:**
- Review alert noise and adjust thresholds
- Check storage usage for Prometheus and Jaeger
- Review slow queries in PostgreSQL dashboard
**Monthly:**
- Update dashboard with new metrics
- Review and update alert runbooks
- Capacity planning based on trends
### Backup and Recovery
**Prometheus Data:**
```bash
# Backup Prometheus data
kubectl exec -n monitoring prometheus-0 -- tar czf /tmp/prometheus-backup.tar.gz /prometheus
kubectl cp monitoring/prometheus-0:/tmp/prometheus-backup.tar.gz ./prometheus-backup.tar.gz
# Restore (stop Prometheus first)
kubectl cp ./prometheus-backup.tar.gz monitoring/prometheus-0:/tmp/
kubectl exec -n monitoring prometheus-0 -- tar xzf /tmp/prometheus-backup.tar.gz -C /
```
**Grafana Dashboards:**
```bash
# Export all dashboards via API
curl -u admin:password http://localhost:3000/api/search | \
jq -r '.[] | .uid' | \
xargs -I{} curl -u admin:password http://localhost:3000/api/dashboards/uid/{} > dashboards-backup.json
```
## 📝 Version History
- **v1.0.0** (2026-01-07) - Initial production-ready monitoring stack
- Prometheus v3.0.1 with HA
- AlertManager v0.27.0 with clustering
- Grafana v12.3.0 with 7 dashboards
- PostgreSQL and Node exporters
- 50+ alert rules
- Comprehensive documentation

View File

@@ -1,20 +0,0 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
# Minimal Monitoring Infrastructure
# SigNoz is now managed via Helm in the 'signoz' namespace
# This kustomization only maintains:
# - Namespace for legacy resources (if needed)
# - Node exporter for infrastructure metrics
# - PostgreSQL exporter for database metrics
# - Optional OTEL collector (can be disabled if using SigNoz's built-in collector)
resources:
- namespace.yaml
- secrets.yaml
# Exporters for metrics collection
- node-exporter.yaml
- postgres-exporter.yaml
# Optional: Keep OTEL collector or use SigNoz's built-in one
# Uncomment if you want a dedicated OTEL collector in monitoring namespace
# - otel-collector.yaml

View File

@@ -1,7 +0,0 @@
apiVersion: v1
kind: Namespace
metadata:
name: monitoring
labels:
name: monitoring
app.kubernetes.io/part-of: bakery-ia

View File

@@ -1,103 +0,0 @@
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: node-exporter
namespace: monitoring
labels:
app: node-exporter
spec:
selector:
matchLabels:
app: node-exporter
updateStrategy:
type: RollingUpdate
rollingUpdate:
maxUnavailable: 1
template:
metadata:
labels:
app: node-exporter
spec:
hostNetwork: true
hostPID: true
nodeSelector:
kubernetes.io/os: linux
tolerations:
# Run on all nodes including master
- operator: Exists
effect: NoSchedule
containers:
- name: node-exporter
image: quay.io/prometheus/node-exporter:v1.7.0
args:
- '--path.sysfs=/host/sys'
- '--path.rootfs=/host/root'
- '--path.procfs=/host/proc'
- '--collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/.+)($|/)'
- '--collector.filesystem.fs-types-exclude=^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$'
- '--collector.netclass.ignored-devices=^(veth.*|[a-f0-9]{15})$'
- '--collector.netdev.device-exclude=^(veth.*|[a-f0-9]{15})$'
- '--web.listen-address=:9100'
ports:
- containerPort: 9100
protocol: TCP
name: metrics
resources:
requests:
memory: "64Mi"
cpu: "50m"
limits:
memory: "128Mi"
cpu: "200m"
volumeMounts:
- name: sys
mountPath: /host/sys
mountPropagation: HostToContainer
readOnly: true
- name: root
mountPath: /host/root
mountPropagation: HostToContainer
readOnly: true
- name: proc
mountPath: /host/proc
mountPropagation: HostToContainer
readOnly: true
securityContext:
runAsNonRoot: true
runAsUser: 65534
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
volumes:
- name: sys
hostPath:
path: /sys
- name: root
hostPath:
path: /
- name: proc
hostPath:
path: /proc
---
apiVersion: v1
kind: Service
metadata:
name: node-exporter
namespace: monitoring
labels:
app: node-exporter
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "9100"
spec:
clusterIP: None
ports:
- name: metrics
port: 9100
protocol: TCP
targetPort: 9100
selector:
app: node-exporter

View File

@@ -1,167 +0,0 @@
---
apiVersion: v1
kind: ConfigMap
metadata:
name: otel-collector-config
namespace: monitoring
data:
otel-collector-config.yaml: |
extensions:
health_check:
endpoint: 0.0.0.0:13133
receivers:
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
http:
endpoint: 0.0.0.0:4318
processors:
batch:
timeout: 10s
send_batch_size: 1024
# Memory limiter to prevent OOM
memory_limiter:
check_interval: 1s
limit_mib: 512
spike_limit_mib: 128
exporters:
# Export metrics to Prometheus
prometheus:
endpoint: "0.0.0.0:8889"
namespace: otelcol
const_labels:
source: otel-collector
# Export to SigNoz
otlp/signoz:
endpoint: "signoz-query-service.monitoring.svc.cluster.local:8080"
tls:
insecure: true
# Logging exporter for debugging traces and logs
logging:
loglevel: info
sampling_initial: 5
sampling_thereafter: 200
service:
extensions: [health_check]
pipelines:
# Traces pipeline: receive -> process -> export to SigNoz
traces:
receivers: [otlp]
processors: [memory_limiter, batch]
exporters: [otlp/signoz, logging]
# Metrics pipeline: receive -> process -> export to both Prometheus and SigNoz
metrics:
receivers: [otlp]
processors: [memory_limiter, batch]
exporters: [prometheus, otlp/signoz]
# Logs pipeline: receive -> process -> export to SigNoz
logs:
receivers: [otlp]
processors: [memory_limiter, batch]
exporters: [otlp/signoz, logging]
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: otel-collector
namespace: monitoring
labels:
app: otel-collector
spec:
replicas: 1
selector:
matchLabels:
app: otel-collector
template:
metadata:
labels:
app: otel-collector
spec:
containers:
- name: otel-collector
image: otel/opentelemetry-collector-contrib:0.91.0
args:
- --config=/conf/otel-collector-config.yaml
ports:
- containerPort: 4317
protocol: TCP
name: otlp-grpc
- containerPort: 4318
protocol: TCP
name: otlp-http
- containerPort: 8889
protocol: TCP
name: prometheus
- containerPort: 13133
protocol: TCP
name: health-check
volumeMounts:
- name: otel-collector-config
mountPath: /conf
resources:
requests:
memory: "256Mi"
cpu: "100m"
limits:
memory: "512Mi"
cpu: "500m"
livenessProbe:
httpGet:
path: /
port: 13133
initialDelaySeconds: 30
periodSeconds: 10
readinessProbe:
httpGet:
path: /
port: 13133
initialDelaySeconds: 5
periodSeconds: 5
volumes:
- name: otel-collector-config
configMap:
name: otel-collector-config
items:
- key: otel-collector-config.yaml
path: otel-collector-config.yaml
---
apiVersion: v1
kind: Service
metadata:
name: otel-collector
namespace: monitoring
labels:
app: otel-collector
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "8889"
prometheus.io/path: "/metrics"
spec:
type: ClusterIP
ports:
- port: 4317
targetPort: 4317
protocol: TCP
name: otlp-grpc
- port: 4318
targetPort: 4318
protocol: TCP
name: otlp-http
- port: 8889
targetPort: 8889
protocol: TCP
name: prometheus
selector:
app: otel-collector

View File

@@ -1,306 +0,0 @@
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: postgres-exporter
namespace: monitoring
labels:
app: postgres-exporter
spec:
replicas: 1
selector:
matchLabels:
app: postgres-exporter
template:
metadata:
labels:
app: postgres-exporter
spec:
containers:
- name: postgres-exporter
image: prometheuscommunity/postgres-exporter:v0.15.0
ports:
- containerPort: 9187
name: metrics
env:
- name: DATA_SOURCE_NAME
valueFrom:
secretKeyRef:
name: postgres-exporter
key: data-source-name
# Enable extended metrics
- name: PG_EXPORTER_EXTEND_QUERY_PATH
value: "/etc/postgres-exporter/queries.yaml"
# Keep default metrics enabled alongside the custom queries below
- name: PG_EXPORTER_DISABLE_DEFAULT_METRICS
value: "false"
# Keep settings metrics enabled (set to "true" if they prove noisy)
- name: PG_EXPORTER_DISABLE_SETTINGS_METRICS
value: "false"
volumeMounts:
- name: queries
mountPath: /etc/postgres-exporter
resources:
requests:
memory: "64Mi"
cpu: "50m"
limits:
memory: "128Mi"
cpu: "200m"
livenessProbe:
httpGet:
path: /
port: 9187
initialDelaySeconds: 30
periodSeconds: 10
readinessProbe:
httpGet:
path: /
port: 9187
initialDelaySeconds: 5
periodSeconds: 5
volumes:
- name: queries
configMap:
name: postgres-exporter-queries
---
apiVersion: v1
kind: ConfigMap
metadata:
name: postgres-exporter-queries
namespace: monitoring
data:
queries.yaml: |
# Custom PostgreSQL queries for bakery-ia metrics
pg_database:
query: |
SELECT
datname,
numbackends as connections,
xact_commit as transactions_committed,
xact_rollback as transactions_rolled_back,
blks_read as blocks_read,
blks_hit as blocks_hit,
tup_returned as tuples_returned,
tup_fetched as tuples_fetched,
tup_inserted as tuples_inserted,
tup_updated as tuples_updated,
tup_deleted as tuples_deleted,
conflicts as conflicts,
temp_files as temp_files,
temp_bytes as temp_bytes,
deadlocks as deadlocks
FROM pg_stat_database
WHERE datname NOT IN ('template0', 'template1', 'postgres')
metrics:
- datname:
usage: "LABEL"
description: "Name of the database"
- connections:
usage: "GAUGE"
description: "Number of backends currently connected to this database"
- transactions_committed:
usage: "COUNTER"
description: "Number of transactions in this database that have been committed"
- transactions_rolled_back:
usage: "COUNTER"
description: "Number of transactions in this database that have been rolled back"
- blocks_read:
usage: "COUNTER"
description: "Number of disk blocks read in this database"
- blocks_hit:
usage: "COUNTER"
description: "Number of times disk blocks were found in the buffer cache"
- tuples_returned:
usage: "COUNTER"
description: "Number of rows returned by queries in this database"
- tuples_fetched:
usage: "COUNTER"
description: "Number of rows fetched by queries in this database"
- tuples_inserted:
usage: "COUNTER"
description: "Number of rows inserted by queries in this database"
- tuples_updated:
usage: "COUNTER"
description: "Number of rows updated by queries in this database"
- tuples_deleted:
usage: "COUNTER"
description: "Number of rows deleted by queries in this database"
- conflicts:
usage: "COUNTER"
description: "Number of queries canceled due to conflicts with recovery"
- temp_files:
usage: "COUNTER"
description: "Number of temporary files created by queries"
- temp_bytes:
usage: "COUNTER"
description: "Total amount of data written to temporary files by queries"
- deadlocks:
usage: "COUNTER"
description: "Number of deadlocks detected in this database"
pg_replication:
query: |
SELECT
CASE WHEN pg_is_in_recovery() THEN 1 ELSE 0 END as is_replica,
EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp()))::INT as lag_seconds
metrics:
- is_replica:
usage: "GAUGE"
description: "1 if this is a replica, 0 if primary"
- lag_seconds:
usage: "GAUGE"
description: "Replication lag in seconds (only on replicas)"
pg_slow_queries:
query: |
SELECT
datname,
usename,
state,
COUNT(*) as count,
MAX(EXTRACT(EPOCH FROM (now() - query_start))) as max_duration_seconds
FROM pg_stat_activity
WHERE state != 'idle'
AND query NOT LIKE '%pg_stat_activity%'
AND query_start < now() - interval '30 seconds'
GROUP BY datname, usename, state
metrics:
- datname:
usage: "LABEL"
description: "Database name"
- usename:
usage: "LABEL"
description: "User name"
- state:
usage: "LABEL"
description: "Query state"
- count:
usage: "GAUGE"
description: "Number of slow queries"
- max_duration_seconds:
usage: "GAUGE"
description: "Maximum query duration in seconds"
pg_table_stats:
query: |
SELECT
schemaname,
relname,
seq_scan,
seq_tup_read,
idx_scan,
idx_tup_fetch,
n_tup_ins,
n_tup_upd,
n_tup_del,
n_tup_hot_upd,
n_live_tup,
n_dead_tup,
n_mod_since_analyze,
last_vacuum,
last_autovacuum,
last_analyze,
last_autoanalyze
FROM pg_stat_user_tables
WHERE schemaname = 'public'
ORDER BY n_live_tup DESC
LIMIT 20
metrics:
- schemaname:
usage: "LABEL"
description: "Schema name"
- relname:
usage: "LABEL"
description: "Table name"
- seq_scan:
usage: "COUNTER"
description: "Number of sequential scans"
- seq_tup_read:
usage: "COUNTER"
description: "Number of tuples read by sequential scans"
- idx_scan:
usage: "COUNTER"
description: "Number of index scans"
- idx_tup_fetch:
usage: "COUNTER"
description: "Number of tuples fetched by index scans"
- n_tup_ins:
usage: "COUNTER"
description: "Number of tuples inserted"
- n_tup_upd:
usage: "COUNTER"
description: "Number of tuples updated"
- n_tup_del:
usage: "COUNTER"
description: "Number of tuples deleted"
- n_tup_hot_upd:
usage: "COUNTER"
description: "Number of tuples HOT updated"
- n_live_tup:
usage: "GAUGE"
description: "Estimated number of live rows"
- n_dead_tup:
usage: "GAUGE"
description: "Estimated number of dead rows"
- n_mod_since_analyze:
usage: "GAUGE"
description: "Number of rows modified since last analyze"
pg_locks:
query: |
SELECT
mode,
locktype,
COUNT(*) as count
FROM pg_locks
GROUP BY mode, locktype
metrics:
- mode:
usage: "LABEL"
description: "Lock mode"
- locktype:
usage: "LABEL"
description: "Lock type"
- count:
usage: "GAUGE"
description: "Number of locks"
pg_connection_pool:
query: |
SELECT
state,
COUNT(*) as count,
MAX(EXTRACT(EPOCH FROM (now() - state_change))) as max_state_duration_seconds
FROM pg_stat_activity
GROUP BY state
metrics:
- state:
usage: "LABEL"
description: "Connection state"
- count:
usage: "GAUGE"
description: "Number of connections in this state"
- max_state_duration_seconds:
usage: "GAUGE"
description: "Maximum time a connection has been in this state"
---
apiVersion: v1
kind: Service
metadata:
name: postgres-exporter
namespace: monitoring
labels:
app: postgres-exporter
spec:
type: ClusterIP
ports:
- port: 9187
targetPort: 9187
protocol: TCP
name: metrics
selector:
app: postgres-exporter

View File

@@ -1,52 +0,0 @@
---
# NOTE: This file contains example secrets for development.
# For production, use one of the following:
# 1. Sealed Secrets (bitnami-labs/sealed-secrets)
# 2. External Secrets Operator
# 3. HashiCorp Vault
# 4. Cloud provider secret managers (AWS Secrets Manager, GCP Secret Manager, Azure Key Vault)
#
# NEVER commit real production secrets to git!
apiVersion: v1
kind: Secret
metadata:
name: grafana-admin
namespace: monitoring
type: Opaque
stringData:
admin-user: admin
# CHANGE THIS PASSWORD IN PRODUCTION!
# Generate with: openssl rand -base64 32
admin-password: "CHANGE_ME_IN_PRODUCTION"
---
apiVersion: v1
kind: Secret
metadata:
name: alertmanager-secrets
namespace: monitoring
type: Opaque
stringData:
# SMTP configuration for email alerts
# CHANGE THESE VALUES IN PRODUCTION!
smtp-host: "smtp.gmail.com:587"
smtp-username: "alerts@yourdomain.com"
smtp-password: "CHANGE_ME_IN_PRODUCTION"
smtp-from: "alerts@yourdomain.com"
# Slack webhook URL (optional)
slack-webhook-url: "https://hooks.slack.com/services/YOUR/WEBHOOK/URL"
---
apiVersion: v1
kind: Secret
metadata:
name: postgres-exporter
namespace: monitoring
type: Opaque
stringData:
# PostgreSQL connection string
# Format: postgresql://username:password@hostname:port/database?sslmode=disable
# CHANGE THIS IN PRODUCTION!
data-source-name: "postgresql://postgres:postgres@postgres.bakery-ia:5432/bakery?sslmode=disable"

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: notification-service
app.kubernetes.io/component: microservice
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
# Wait for Redis to be ready
- name: wait-for-redis
@@ -92,6 +94,26 @@ spec:
ports:
- containerPort: 8000
name: http
env:
# OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME
value: "notification-service"
- name: ENABLE_TRACING
value: "true"
# Logging Configuration
- name: OTEL_LOGS_EXPORTER
value: "otlp"
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
value: "true"
# Metrics Configuration
- name: ENABLE_OTEL_METRICS
value: "true"
- name: ENABLE_SYSTEM_METRICS
value: "true"
envFrom:
- configMapRef:
name: bakery-config

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: orchestrator-service
app.kubernetes.io/component: microservice
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
# Wait for Redis to be ready
- name: wait-for-redis
@@ -92,6 +94,26 @@ spec:
ports:
- containerPort: 8000
name: http
env:
# OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME
value: "orchestrator-service"
- name: ENABLE_TRACING
value: "true"
# Logging Configuration
- name: OTEL_LOGS_EXPORTER
value: "otlp"
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
value: "true"
# Metrics Configuration
- name: ENABLE_OTEL_METRICS
value: "true"
- name: ENABLE_SYSTEM_METRICS
value: "true"
envFrom:
- configMapRef:
name: bakery-config

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: orders-service
app.kubernetes.io/component: microservice
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
# Wait for Redis to be ready
- name: wait-for-redis
@@ -92,6 +94,26 @@ spec:
ports:
- containerPort: 8000
name: http
env:
# OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME
value: "orders-service"
- name: ENABLE_TRACING
value: "true"
# Logging Configuration
- name: OTEL_LOGS_EXPORTER
value: "otlp"
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
value: "true"
# Metrics Configuration
- name: ENABLE_OTEL_METRICS
value: "true"
- name: ENABLE_SYSTEM_METRICS
value: "true"
envFrom:
- configMapRef:
name: bakery-config

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: pos-service
app.kubernetes.io/component: microservice
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
# Wait for Redis to be ready
- name: wait-for-redis
@@ -92,6 +94,26 @@ spec:
ports:
- containerPort: 8000
name: http
env:
# OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME
value: "pos-service"
- name: ENABLE_TRACING
value: "true"
# Logging Configuration
- name: OTEL_LOGS_EXPORTER
value: "otlp"
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
value: "true"
# Metrics Configuration
- name: ENABLE_OTEL_METRICS
value: "true"
- name: ENABLE_SYSTEM_METRICS
value: "true"
envFrom:
- configMapRef:
name: bakery-config

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: procurement-service
app.kubernetes.io/component: microservice
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
# Wait for Redis to be ready
- name: wait-for-redis
@@ -92,6 +94,26 @@ spec:
ports:
- containerPort: 8000
name: http
env:
# OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME
value: "procurement-service"
- name: ENABLE_TRACING
value: "true"
# Logging Configuration
- name: OTEL_LOGS_EXPORTER
value: "otlp"
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
value: "true"
# Metrics Configuration
- name: ENABLE_OTEL_METRICS
value: "true"
- name: ENABLE_SYSTEM_METRICS
value: "true"
envFrom:
- configMapRef:
name: bakery-config

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: production-service
app.kubernetes.io/component: microservice
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
# Wait for Redis to be ready
- name: wait-for-redis
@@ -92,6 +94,26 @@ spec:
ports:
- containerPort: 8000
name: http
env:
# OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME
value: "production-service"
- name: ENABLE_TRACING
value: "true"
# Logging Configuration
- name: OTEL_LOGS_EXPORTER
value: "otlp"
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
value: "true"
# Metrics Configuration
- name: ENABLE_OTEL_METRICS
value: "true"
- name: ENABLE_SYSTEM_METRICS
value: "true"
envFrom:
- configMapRef:
name: bakery-config

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: recipes-service
app.kubernetes.io/component: microservice
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
# Wait for Redis to be ready
- name: wait-for-redis
@@ -92,6 +94,26 @@ spec:
ports:
- containerPort: 8000
name: http
env:
# OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME
value: "recipes-service"
- name: ENABLE_TRACING
value: "true"
# Logging Configuration
- name: OTEL_LOGS_EXPORTER
value: "otlp"
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
value: "true"
# Metrics Configuration
- name: ENABLE_OTEL_METRICS
value: "true"
- name: ENABLE_SYSTEM_METRICS
value: "true"
envFrom:
- configMapRef:
name: bakery-config

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: sales-service
app.kubernetes.io/component: microservice
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
# Wait for Redis to be ready
- name: wait-for-redis
@@ -92,6 +94,26 @@ spec:
ports:
- containerPort: 8000
name: http
env:
# OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME
value: "sales-service"
- name: ENABLE_TRACING
value: "true"
# Logging Configuration
- name: OTEL_LOGS_EXPORTER
value: "otlp"
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
value: "true"
# Metrics Configuration
- name: ENABLE_OTEL_METRICS
value: "true"
- name: ENABLE_SYSTEM_METRICS
value: "true"
envFrom:
- configMapRef:
name: bakery-config

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: suppliers-service
app.kubernetes.io/component: microservice
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
# Wait for Redis to be ready
- name: wait-for-redis
@@ -92,6 +94,26 @@ spec:
ports:
- containerPort: 8000
name: http
env:
# OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME
value: "suppliers-service"
- name: ENABLE_TRACING
value: "true"
# Logging Configuration
- name: OTEL_LOGS_EXPORTER
value: "otlp"
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
value: "true"
# Metrics Configuration
- name: ENABLE_OTEL_METRICS
value: "true"
- name: ENABLE_SYSTEM_METRICS
value: "true"
envFrom:
- configMapRef:
name: bakery-config

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: tenant-service
app.kubernetes.io/component: microservice
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
# Wait for Redis to be ready
- name: wait-for-redis
@@ -92,6 +94,26 @@ spec:
ports:
- containerPort: 8000
name: http
env:
# OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME
value: "tenant-service"
- name: ENABLE_TRACING
value: "true"
# Logging Configuration
- name: OTEL_LOGS_EXPORTER
value: "otlp"
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
value: "true"
# Metrics Configuration
- name: ENABLE_OTEL_METRICS
value: "true"
- name: ENABLE_SYSTEM_METRICS
value: "true"
envFrom:
- configMapRef:
name: bakery-config

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: training-service
app.kubernetes.io/component: microservice
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
# Wait for Redis to be ready
- name: wait-for-redis
@@ -92,6 +94,26 @@ spec:
ports:
- containerPort: 8000
name: http
env:
# OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME
value: "training-service"
- name: ENABLE_TRACING
value: "true"
# Logging Configuration
- name: OTEL_LOGS_EXPORTER
value: "otlp"
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
value: "true"
# Metrics Configuration
- name: ENABLE_OTEL_METRICS
value: "true"
- name: ENABLE_SYSTEM_METRICS
value: "true"
envFrom:
- configMapRef:
name: bakery-config

View File

@@ -17,6 +17,8 @@ spec:
labels:
app: demo-cleanup
spec:
imagePullSecrets:
- name: dockerhub-creds
template:
metadata:
labels:

View File

@@ -22,6 +22,8 @@ spec:
app: external-service
job: data-rotation
spec:
imagePullSecrets:
- name: dockerhub-creds
ttlSecondsAfterFinished: 172800
backoffLimit: 2

View File

@@ -19,6 +19,8 @@ spec:
component: background-jobs
service: demo-session
spec:
imagePullSecrets:
- name: dockerhub-creds
containers:
- name: worker
image: bakery/demo-session-service

View File

@@ -20,25 +20,23 @@ metadata:
     nginx.ingress.kubernetes.io/upstream-keepalive-timeout: "3600"
     # WebSocket upgrade support
     nginx.ingress.kubernetes.io/websocket-services: "gateway-service"
-    # CORS configuration for HTTPS and local development
+    # CORS configuration for HTTPS
     nginx.ingress.kubernetes.io/enable-cors: "true"
-    nginx.ingress.kubernetes.io/cors-allow-origin: "https://bakery-ia.local,https://api.bakery-ia.local,https://monitoring.bakery-ia.local,https://localhost"
+    nginx.ingress.kubernetes.io/cors-allow-origin: "https://your-domain.com" # To be overridden in overlays
     nginx.ingress.kubernetes.io/cors-allow-methods: "GET, POST, PUT, DELETE, OPTIONS, PATCH"
     nginx.ingress.kubernetes.io/cors-allow-headers: "Content-Type, Authorization, X-Requested-With, Accept, Origin, Cache-Control"
     nginx.ingress.kubernetes.io/cors-allow-credentials: "true"
-    # Cert-manager annotations for automatic certificate issuance
-    cert-manager.io/cluster-issuer: "letsencrypt-staging"
-    cert-manager.io/acme-challenge-type: http01
+    # Using issuer appropriate for environment
+    cert-manager.io/cluster-issuer: "letsencrypt-prod" # To be overridden in dev overlay
 spec:
   ingressClassName: nginx
   tls:
     - hosts:
-        - bakery-ia.local
-        - api.bakery-ia.local
-        - monitoring.bakery-ia.local
-      secretName: bakery-ia-tls-cert
+        - your-domain.com # To be overridden in overlays
+      secretName: bakery-tls-cert # To be overridden in overlays
   rules:
-    - host: bakery-ia.local
+    - host: your-domain.com # To be overridden in overlays
       http:
         paths:
           - path: /
@@ -55,7 +53,7 @@ spec:
                 name: gateway-service
                 port:
                   number: 8000
-    - host: api.bakery-ia.local
+    - host: api.your-domain.com # To be overridden in overlays
       http:
         paths:
           - path: /
@@ -65,20 +63,22 @@ spec:
                 name: gateway-service
                 port:
                   number: 8000
-    - host: monitoring.bakery-ia.local
+    - host: monitoring.your-domain.com # To be overridden in overlays
       http:
         paths:
-          - path: /grafana
-            pathType: Prefix
+          # SigNoz Frontend UI and API (consolidated in newer versions)
+          - path: /signoz(/|$)(.*)
+            pathType: ImplementationSpecific
             backend:
               service:
-                name: grafana-service
+                name: signoz
                 port:
-                  number: 3000
-          - path: /prometheus
-            pathType: Prefix
+                  number: 8080
+          # SigNoz API endpoints
+          - path: /signoz-api(/|$)(.*)
+            pathType: ImplementationSpecific
             backend:
               service:
-                name: prometheus-service
+                name: signoz
                 port:
-                  number: 9090
+                  number: 8080
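
The "To be overridden in overlays" comments imply one kustomize patch per environment; a minimal sketch of how a dev overlay might wire one in (file and resource names are hypothetical):
```yaml
# overlays/dev/kustomization.yaml (hypothetical path)
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - ../../base
patches:
  - path: ingress-patch.yaml      # restores the *.bakery-ia.local hosts and staging issuer
    target:
      kind: Ingress
```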

View File

@@ -17,6 +17,8 @@ spec:
app: external-service
job: data-init
spec:
imagePullSecrets:
- name: dockerhub-creds
restartPolicy: OnFailure
initContainers:

View File

@@ -15,6 +15,8 @@ spec:
app.kubernetes.io/name: nominatim-init
app.kubernetes.io/component: data-init
spec:
imagePullSecrets:
- name: dockerhub-creds
restartPolicy: OnFailure
containers:
- name: nominatim-import

View File

@@ -66,6 +66,10 @@ resources:
# Persistent storage
- components/volumes/model-storage-pvc.yaml
# Cert manager cluster issuers
- components/cert-manager/cluster-issuer-staging.yaml
- components/cert-manager/local-ca-issuer.yaml
# Database services
- components/databases/auth-db.yaml
- components/databases/tenant-db.yaml

View File

@@ -16,6 +16,8 @@ spec:
app.kubernetes.io/name: ai-insights-migration
app.kubernetes.io/component: migration
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
- name: wait-for-db
image: postgres:17-alpine

View File

@@ -16,6 +16,8 @@ spec:
app.kubernetes.io/name: alert-processor-migration
app.kubernetes.io/component: migration
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
- name: wait-for-db
image: postgres:17-alpine

View File

@@ -16,6 +16,8 @@ spec:
app.kubernetes.io/name: auth-migration
app.kubernetes.io/component: migration
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
- name: wait-for-db
image: postgres:17-alpine

View File

@@ -29,4 +29,4 @@ roleRef:
subjects:
- kind: ServiceAccount
name: demo-seed-sa
- namespace: bakery-ia
\ No newline at end of file
+ namespace: bakery-ia

View File

@@ -15,6 +15,8 @@ spec:
app.kubernetes.io/name: demo-session-migration
app.kubernetes.io/component: migration
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
- name: wait-for-db
image: postgres:17-alpine

View File

@@ -16,6 +16,8 @@ spec:
app.kubernetes.io/name: distribution-migration
app.kubernetes.io/component: migration
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
- name: wait-for-db
image: postgres:17-alpine

View File

@@ -16,6 +16,8 @@ spec:
app.kubernetes.io/name: external-migration
app.kubernetes.io/component: migration
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
- name: wait-for-db
image: postgres:17-alpine

View File

@@ -16,6 +16,8 @@ spec:
app.kubernetes.io/name: forecasting-migration
app.kubernetes.io/component: migration
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
- name: wait-for-db
image: postgres:17-alpine

View File

@@ -16,6 +16,8 @@ spec:
app.kubernetes.io/name: inventory-migration
app.kubernetes.io/component: migration
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
- name: wait-for-db
image: postgres:17-alpine

View File

@@ -16,6 +16,8 @@ spec:
app.kubernetes.io/name: notification-migration
app.kubernetes.io/component: migration
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
- name: wait-for-db
image: postgres:17-alpine

View File

@@ -16,6 +16,8 @@ spec:
app.kubernetes.io/name: orchestrator-migration
app.kubernetes.io/component: migration
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
- name: wait-for-db
image: postgres:17-alpine

View File

@@ -16,6 +16,8 @@ spec:
app.kubernetes.io/name: orders-migration
app.kubernetes.io/component: migration
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
- name: wait-for-db
image: postgres:17-alpine

View File

@@ -16,6 +16,8 @@ spec:
app.kubernetes.io/name: pos-migration
app.kubernetes.io/component: migration
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
- name: wait-for-db
image: postgres:17-alpine

View File

@@ -16,6 +16,8 @@ spec:
app.kubernetes.io/name: procurement-migration
app.kubernetes.io/component: migration
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
- name: wait-for-db
image: postgres:17-alpine

View File

@@ -16,6 +16,8 @@ spec:
app.kubernetes.io/name: production-migration
app.kubernetes.io/component: migration
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
- name: wait-for-db
image: postgres:17-alpine

View File

@@ -16,6 +16,8 @@ spec:
app.kubernetes.io/name: recipes-migration
app.kubernetes.io/component: migration
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
- name: wait-for-db
image: postgres:17-alpine

View File

@@ -16,6 +16,8 @@ spec:
app.kubernetes.io/name: sales-migration
app.kubernetes.io/component: migration
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
- name: wait-for-db
image: postgres:17-alpine

View File

@@ -16,6 +16,8 @@ spec:
app.kubernetes.io/name: suppliers-migration
app.kubernetes.io/component: migration
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
- name: wait-for-db
image: postgres:17-alpine

View File

@@ -16,6 +16,8 @@ spec:
app.kubernetes.io/name: tenant-migration
app.kubernetes.io/component: migration
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
- name: wait-for-db
image: postgres:17-alpine

View File

@@ -16,6 +16,8 @@ spec:
app.kubernetes.io/name: tenant-seed-pilot-coupon
app.kubernetes.io/component: seed
spec:
imagePullSecrets:
- name: dockerhub-creds
serviceAccountName: demo-seed-sa
initContainers:
- name: wait-for-tenant-migration

View File

@@ -16,6 +16,8 @@ spec:
app.kubernetes.io/name: training-migration
app.kubernetes.io/component: migration
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
- name: wait-for-db
image: postgres:17-alpine