diff --git a/Tiltfile b/Tiltfile index 603dc0b7..aa1724e0 100644 --- a/Tiltfile +++ b/Tiltfile @@ -402,7 +402,7 @@ local_resource( echo "" echo "📈 SigNoz Access Information:" - echo " URL: https://monitoring.bakery-ia.local/signoz" + echo " URL: https://monitoring.bakery-ia.local" echo " Username: admin" echo " Password: admin" echo "" @@ -445,7 +445,7 @@ local_resource( if [ "$READY_PODS" -eq "$TOTAL_PODS" ]; then echo "✅ All SigNoz pods are running!" echo "" - echo "Access SigNoz at: https://monitoring.bakery-ia.local/signoz" + echo "Access SigNoz at: https://monitoring.bakery-ia.local" echo "Credentials: admin / admin" else echo "⏳ Waiting for pods to become ready..." @@ -687,7 +687,7 @@ Access your application: SigNoz (Unified Observability): Deploy via Tilt: Trigger 'signoz-deployment' resource Manual deploy: ./infrastructure/helm/deploy-signoz.sh dev - Access (if deployed): https://monitoring.bakery-ia.local/signoz + Access (if deployed): https://monitoring.bakery-ia.local Username: admin Password: admin diff --git a/docs/DATABASE_MONITORING.md b/docs/DATABASE_MONITORING.md index dda19b4c..32ae323a 100644 --- a/docs/DATABASE_MONITORING.md +++ b/docs/DATABASE_MONITORING.md @@ -162,7 +162,7 @@ data: exporters: # Send to SigNoz otlphttp: - endpoint: http://signoz-otel-collector.signoz.svc.cluster.local:4318 + endpoint: http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318 tls: insecure: true @@ -374,7 +374,7 @@ processors: exporters: otlphttp/logs: - endpoint: http://signoz-otel-collector.signoz.svc.cluster.local:4318/v1/logs + endpoint: http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318/v1/logs service: pipelines: diff --git a/docs/MONITORING_DOCUMENTATION.md b/docs/MONITORING_DOCUMENTATION.md index 8dea4b38..1eeea9a9 100644 --- a/docs/MONITORING_DOCUMENTATION.md +++ b/docs/MONITORING_DOCUMENTATION.md @@ -316,8 +316,8 @@ spec: #### Issue: No Metrics Appearing in SigNoz **Checklist:** -- ✅ OpenTelemetry Collector running? 
`kubectl get pods -n signoz` -- ✅ Service can reach collector? `telnet signoz-otel-collector.signoz 4318` +- ✅ OpenTelemetry Collector running? `kubectl get pods -n bakery-ia -l app.kubernetes.io/instance=signoz` +- ✅ Service can reach collector? `telnet signoz-otel-collector.bakery-ia 4318` - ✅ OTLP endpoint configured correctly? Check `OTEL_EXPORTER_OTLP_ENDPOINT` - ✅ Service logs show OTLP export? Look for "Exporting metrics" - ✅ No network policies blocking? Check Kubernetes network policies @@ -325,13 +325,13 @@ spec: **Debugging:** ```bash # Check OpenTelemetry Collector logs -kubectl logs -n signoz -l app=otel-collector +kubectl logs -n bakery-ia -l app=otel-collector # Check service logs for OTLP errors kubectl logs -l app=auth-service | grep -i otel # Test OTLP connectivity from service pod -kubectl exec -it auth-service-pod -- curl -v http://signoz-otel-collector.signoz:4318 +kubectl exec -it auth-service-pod -- curl -v http://signoz-otel-collector.bakery-ia:4318 ``` #### Issue: High Latency in Specific Service @@ -442,7 +442,7 @@ class MyService(StandardFastAPIService): ```env # OpenTelemetry Collector endpoint -OTEL_EXPORTER_OTLP_ENDPOINT=http://signoz-otel-collector.signoz:4318 +OTEL_EXPORTER_OTLP_ENDPOINT=http://signoz-otel-collector.bakery-ia:4318 # Service-specific configuration OTEL_SERVICE_NAME=auth-service @@ -473,7 +473,7 @@ spec: image: auth-service:latest env: - name: OTEL_EXPORTER_OTLP_ENDPOINT - value: "http://signoz-otel-collector.signoz:4318" + value: "http://signoz-otel-collector.bakery-ia:4318" - name: OTEL_SERVICE_NAME value: "auth-service" - name: ENVIRONMENT diff --git a/gateway/app/main.py b/gateway/app/main.py index 70e908df..156fb637 100644 --- a/gateway/app/main.py +++ b/gateway/app/main.py @@ -48,7 +48,7 @@ def setup_tracing(service_name: str = "gateway"): # Configure OTLP exporter (sends to OpenTelemetry Collector) otlp_exporter = OTLPSpanExporter( - endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", 
"http://otel-collector.monitoring.svc.cluster.local:4317"), + endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4317"), insecure=True # Use insecure connection for internal cluster communication ) diff --git a/infrastructure/helm/README.md b/infrastructure/helm/README.md new file mode 100644 index 00000000..fb64cac2 --- /dev/null +++ b/infrastructure/helm/README.md @@ -0,0 +1,554 @@ +# SigNoz Helm Deployment for Bakery IA + +This directory contains Helm configurations and deployment scripts for SigNoz observability platform. + +## Overview + +SigNoz is deployed using the official Helm chart with environment-specific configurations optimized for: +- **Development**: Colima + Kind (Kubernetes in Docker) with Tilt +- **Production**: VPS on clouding.io with MicroK8s + +## Prerequisites + +### Required Tools +- **kubectl** 1.22+ +- **Helm** 3.8+ +- **Docker** (for development) +- **Kind/MicroK8s** (environment-specific) + +### Docker Hub Authentication + +SigNoz uses images from Docker Hub. 
Set up authentication to avoid rate limits: + +```bash +# Option 1: Environment variables (recommended) +export DOCKERHUB_USERNAME='your-username' +export DOCKERHUB_PASSWORD='your-personal-access-token' + +# Option 2: Docker login +docker login +``` + +## Quick Start + +### Development Deployment + +```bash +# Deploy SigNoz to development environment +./deploy-signoz.sh dev + +# Verify deployment +./verify-signoz.sh dev + +# Access SigNoz UI +# Via ingress: http://monitoring.bakery-ia.local +# Or port-forward: +kubectl port-forward -n bakery-ia svc/signoz 8080:8080 +# Then open: http://localhost:8080 +``` + +### Production Deployment + +```bash +# Deploy SigNoz to production environment +./deploy-signoz.sh prod + +# Verify deployment +./verify-signoz.sh prod + +# Access SigNoz UI +# https://monitoring.bakewise.ai +``` + +## Configuration Files + +### signoz-values-dev.yaml + +Development environment configuration with: +- Single replica for most components +- Reduced resource requests (optimized for local Kind cluster) +- 7-day data retention +- Batch size: 10,000 events +- ClickHouse 25.5.6, OTel Collector v0.129.12 +- PostgreSQL, Redis, and RabbitMQ receivers configured + +### signoz-values-prod.yaml + +Production environment configuration with: +- High availability: 2+ replicas for critical components +- 3 Zookeeper replicas (required for production) +- 30-day data retention +- Batch size: 50,000 events (high-performance) +- Cold storage enabled with 30-day TTL +- Horizontal Pod Autoscaler (HPA) enabled +- TLS/SSL with cert-manager +- Enhanced security with pod anti-affinity rules + +## Key Configuration Changes (v0.89.0+) + +⚠️ **BREAKING CHANGE**: SigNoz Helm chart v0.89.0+ uses a unified component structure. 
+ +**Old Structure (deprecated):** +```yaml +frontend: + replicaCount: 2 +queryService: + replicaCount: 2 +``` + +**New Structure (current):** +```yaml +signoz: + replicaCount: 2 + # Combines frontend + query service +``` + +## Component Architecture + +### Core Components + +1. **SigNoz** (unified component) + - Frontend UI + Query Service + - Port 8080 (HTTP/API), 8085 (internal gRPC) + - Dev: 1 replica, Prod: 2+ replicas with HPA + +2. **ClickHouse** (Time-series database) + - Version: 25.5.6 + - Stores traces, metrics, and logs + - Dev: 1 replica, Prod: 2 replicas with cold storage + +3. **Zookeeper** (ClickHouse coordination) + - Version: 3.7.1 + - Dev: 1 replica, Prod: 3 replicas (critical for HA) + +4. **OpenTelemetry Collector** (Data ingestion) + - Version: v0.129.12 + - Ports: 4317 (gRPC), 4318 (HTTP), 8888 (metrics) + - Dev: 1 replica, Prod: 2+ replicas with HPA + +5. **Alertmanager** (Alert management) + - Version: 0.23.5 + - Email and Slack integrations configured + - Port: 9093 + +## Performance Optimizations + +### Batch Processing +- **Development**: 10,000 events per batch +- **Production**: 50,000 events per batch (official recommendation) +- Timeout: 1 second for faster processing + +### Memory Management +- Memory limiter processor prevents OOM +- Dev: 400 MiB limit, Prod: 1500 MiB limit +- Spike limits configured + +### Span Metrics Processor +Automatically generates RED metrics (Rate, Errors, Duration): +- Latency histogram buckets optimized for microservices +- Cache size: 10K (dev), 100K (prod) + +### Cold Storage (Production Only) +- Enabled with 30-day TTL +- Automatically moves old data to cold storage +- Keeps 10GB free on primary storage + +## OpenTelemetry Endpoints + +### From Within Kubernetes Cluster + +**Development:** +``` +OTLP gRPC: signoz-otel-collector.bakery-ia.svc.cluster.local:4317 +OTLP HTTP: signoz-otel-collector.bakery-ia.svc.cluster.local:4318 +``` + +**Production:** +``` +OTLP gRPC: 
signoz-otel-collector.bakery-ia.svc.cluster.local:4317 +OTLP HTTP: signoz-otel-collector.bakery-ia.svc.cluster.local:4318 +``` + +### Application Configuration Example + +```yaml +# Python with OpenTelemetry +OTEL_EXPORTER_OTLP_ENDPOINT: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318" +OTEL_EXPORTER_OTLP_PROTOCOL: "http/protobuf" +``` + +```javascript +// Node.js with OpenTelemetry +const exporter = new OTLPTraceExporter({ + url: 'http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318/v1/traces', +}); +``` + +## Deployment Scripts + +### deploy-signoz.sh + +Comprehensive deployment script with features: + +```bash +# Usage +./deploy-signoz.sh [OPTIONS] ENVIRONMENT + +# Options +-h, --help Show help message +-d, --dry-run Show what would be deployed +-u, --upgrade Upgrade existing deployment +-r, --remove Remove deployment +-n, --namespace NS Custom namespace (default: bakery-ia) + +# Examples +./deploy-signoz.sh dev # Deploy to dev +./deploy-signoz.sh --upgrade prod # Upgrade prod +./deploy-signoz.sh --dry-run prod # Preview changes +./deploy-signoz.sh --remove dev # Remove dev deployment +``` + +**Features:** +- Automatic Helm repository setup +- Docker Hub secret creation +- Namespace management +- Deployment verification +- 15-minute timeout with `--wait` flag + +### verify-signoz.sh + +Verification script to check deployment health: + +```bash +# Usage +./verify-signoz.sh [OPTIONS] ENVIRONMENT + +# Examples +./verify-signoz.sh dev # Verify dev deployment +./verify-signoz.sh prod # Verify prod deployment +``` + +**Checks performed:** +1. ✅ Helm release status +2. ✅ Pod health and readiness +3. ✅ Service availability +4. ✅ Ingress configuration +5. ✅ PVC status +6. ✅ Resource usage (if metrics-server available) +7. ✅ Log errors +8. 
✅ Environment-specific validations + - Dev: Single replica, resource limits + - Prod: HA config, TLS, Zookeeper replicas, HPA + +## Storage Configuration + +### Development (Kind) +```yaml +global: + storageClass: "standard" # Kind's default provisioner +``` + +### Production (MicroK8s) +```yaml +global: + storageClass: "microk8s-hostpath" # Or custom storage class +``` + +**Storage Requirements:** +- **Development**: ~35 GiB total + - SigNoz: 5 GiB + - ClickHouse: 20 GiB + - Zookeeper: 5 GiB + - Alertmanager: 2 GiB + +- **Production**: ~135 GiB total + - SigNoz: 20 GiB + - ClickHouse: 100 GiB + - Zookeeper: 10 GiB + - Alertmanager: 5 GiB + +## Resource Requirements + +### Development Environment +**Minimum:** +- CPU: 550m (0.55 cores) +- Memory: 1.6 GiB +- Storage: 35 GiB + +**Recommended:** +- CPU: 3 cores +- Memory: 3 GiB +- Storage: 50 GiB + +### Production Environment +**Minimum:** +- CPU: 3.5 cores +- Memory: 8 GiB +- Storage: 135 GiB + +**Recommended:** +- CPU: 12 cores +- Memory: 20 GiB +- Storage: 200 GiB + +## Data Retention + +### Development +- Traces: 7 days (168 hours) +- Metrics: 7 days (168 hours) +- Logs: 7 days (168 hours) + +### Production +- Traces: 30 days (720 hours) +- Metrics: 30 days (720 hours) +- Logs: 30 days (720 hours) +- Cold storage after 30 days + +To modify retention, update the environment variables: +```yaml +signoz: + env: + signoz_traces_ttl_duration_hrs: "720" # 30 days + signoz_metrics_ttl_duration_hrs: "720" # 30 days + signoz_logs_ttl_duration_hrs: "168" # 7 days +``` + +## High Availability (Production) + +### Replication Strategy +```yaml +signoz: 2 replicas + HPA (min: 2, max: 5) +clickhouse: 2 replicas +zookeeper: 3 replicas (critical!) 
+otelCollector: 2 replicas + HPA (min: 2, max: 10) +alertmanager: 2 replicas +``` + +### Pod Anti-Affinity +Ensures pods are distributed across different nodes: +```yaml +affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/component: query-service + topologyKey: kubernetes.io/hostname +``` + +### Pod Disruption Budgets +Configured for all critical components: +```yaml +podDisruptionBudget: + enabled: true + minAvailable: 1 +``` + +## Monitoring and Alerting + +### Email Alerts (Production) +Configure SMTP in production values: +```yaml +signoz: + env: + signoz_smtp_enabled: "true" + signoz_smtp_host: "smtp.gmail.com" + signoz_smtp_port: "587" + signoz_smtp_from: "alerts@bakewise.ai" + signoz_smtp_username: "alerts@bakewise.ai" + # Set via secret: signoz_smtp_password +``` + +### Slack Alerts (Production) +Configure webhook in Alertmanager: +```yaml +alertmanager: + config: + receivers: + - name: 'critical-alerts' + slack_configs: + - api_url: '${SLACK_WEBHOOK_URL}' + channel: '#alerts-critical' +``` + +### Self-Monitoring +SigNoz monitors itself: +```yaml +selfMonitoring: + enabled: true + serviceMonitor: + enabled: true # Prod only + interval: 30s +``` + +## Troubleshooting + +### Common Issues + +**1. Pods not starting** +```bash +# Check pod status +kubectl get pods -n bakery-ia + +# Check pod logs +kubectl logs -n bakery-ia + +# Describe pod for events +kubectl describe pod -n bakery-ia +``` + +**2. Docker Hub rate limits** +```bash +# Verify secret exists +kubectl get secret dockerhub-creds -n bakery-ia + +# Recreate secret +kubectl delete secret dockerhub-creds -n bakery-ia +export DOCKERHUB_USERNAME='your-username' +export DOCKERHUB_PASSWORD='your-token' +./deploy-signoz.sh dev +``` + +**3. 
ClickHouse connection issues** +```bash +# Check ClickHouse pod +kubectl logs -n bakery-ia -l app.kubernetes.io/component=clickhouse + +# Check Zookeeper (required by ClickHouse) +kubectl logs -n bakery-ia -l app.kubernetes.io/component=zookeeper +``` + +**4. OTel Collector not receiving data** +```bash +# Check OTel Collector logs +kubectl logs -n bakery-ia -l app.kubernetes.io/component=otel-collector + +# Test connectivity +kubectl port-forward -n bakery-ia svc/signoz-otel-collector 4318:4318 +curl -v http://localhost:4318/v1/traces +``` + +**5. Insufficient storage** +```bash +# Check PVC status +kubectl get pvc -n bakery-ia + +# Check storage usage (if metrics-server available) +kubectl top pods -n bakery-ia +``` + +### Debug Mode + +Enable debug exporter in OTel Collector: +```yaml +otelCollector: + config: + exporters: + debug: + verbosity: detailed + sampling_initial: 5 + sampling_thereafter: 200 + service: + pipelines: + traces: + exporters: [clickhousetraces, debug] # Add debug +``` + +### Upgrade from Old Version + +If upgrading from pre-v0.89.0: +```bash +# 1. Backup data (recommended) +kubectl get all -n bakery-ia -o yaml > signoz-backup.yaml + +# 2. Remove old deployment +./deploy-signoz.sh --remove prod + +# 3. Deploy new version +./deploy-signoz.sh prod + +# 4. Verify +./verify-signoz.sh prod +``` + +## Security Best Practices + +1. **Change default password** immediately after first login +2. **Use TLS/SSL** in production (configured with cert-manager) +3. **Network policies** enabled in production +4. **Run as non-root** (configured in securityContext) +5. **RBAC** with dedicated service account +6. **Secrets management** for sensitive data (SMTP, Slack webhooks) +7. 
**Image pull secrets** to avoid exposing Docker Hub credentials + +## Backup and Recovery + +### Backup ClickHouse Data +```bash +# Export ClickHouse data +kubectl exec -n signoz -- clickhouse-client \ + --query="BACKUP DATABASE signoz_traces TO Disk('backups', 'traces_backup.zip')" + +# Copy backup out +kubectl cp signoz/:/var/lib/clickhouse/backups/ ./backups/ +``` + +### Restore from Backup +```bash +# Copy backup in +kubectl cp ./backups/ signoz/:/var/lib/clickhouse/backups/ + +# Restore +kubectl exec -n signoz -- clickhouse-client \ + --query="RESTORE DATABASE signoz_traces FROM Disk('backups', 'traces_backup.zip')" +``` + +## Updating Configuration + +To update SigNoz configuration: + +1. Edit values file: `signoz-values-{env}.yaml` +2. Apply changes: + ```bash + ./deploy-signoz.sh --upgrade {env} + ``` +3. Verify: + ```bash + ./verify-signoz.sh {env} + ``` + +## Uninstallation + +```bash +# Remove SigNoz deployment +./deploy-signoz.sh --remove {env} + +# Optionally delete PVCs (WARNING: deletes all data) +kubectl delete pvc -n signoz -l app.kubernetes.io/instance=signoz + +# Optionally delete namespace +kubectl delete namespace signoz +``` + +## References + +- [SigNoz Official Documentation](https://signoz.io/docs/) +- [SigNoz Helm Charts Repository](https://github.com/SigNoz/charts) +- [OpenTelemetry Documentation](https://opentelemetry.io/docs/) +- [ClickHouse Documentation](https://clickhouse.com/docs/) + +## Support + +For issues or questions: +1. Check [SigNoz GitHub Issues](https://github.com/SigNoz/signoz/issues) +2. Review deployment logs: `kubectl logs -n signoz ` +3. Run verification script: `./verify-signoz.sh {env}` +4. 
Check [SigNoz Community Slack](https://signoz.io/slack) + +--- + +**Last Updated**: 2026-01-09 +**SigNoz Helm Chart Version**: Latest (v0.129.12 components) +**Maintained by**: Bakery IA Team diff --git a/infrastructure/helm/deploy-signoz.sh b/infrastructure/helm/deploy-signoz.sh index 5e4c370d..0af5ab50 100755 --- a/infrastructure/helm/deploy-signoz.sh +++ b/infrastructure/helm/deploy-signoz.sh @@ -30,7 +30,7 @@ show_help() { -d, --dry-run Dry run - show what would be done without actually deploying -u, --upgrade Upgrade existing deployment -r, --remove Remove/Uninstall SigNoz deployment - -n, --namespace NAMESPACE Specify namespace (default: signoz)" + -n, --namespace NAMESPACE Specify namespace (default: bakery-ia)" echo "" echo "Examples: $0 dev # Deploy to development @@ -51,7 +51,7 @@ show_help() { DRY_RUN=false UPGRADE=false REMOVE=false -NAMESPACE="signoz" +NAMESPACE="bakery-ia" while [[ $# -gt 0 ]]; do case $1 in @@ -208,92 +208,90 @@ create_dockerhub_secret() { echo "" } +# Function to add and update Helm repository +setup_helm_repo() { + echo "${BLUE}Setting up SigNoz Helm repository...${NC}" + + if [[ "$DRY_RUN" == true ]]; then + echo " (dry-run) Would add SigNoz Helm repository" + return + fi + + # Add SigNoz Helm repository + if helm repo list | grep -q "^signoz"; then + echo "${BLUE}SigNoz repository already added, updating...${NC}" + helm repo update signoz + else + echo "${BLUE}Adding SigNoz Helm repository...${NC}" + helm repo add signoz https://charts.signoz.io + helm repo update + fi + + echo "${GREEN}Helm repository ready.${NC}" + echo "" +} + # Function to deploy SigNoz deploy_signoz() { local values_file="infrastructure/helm/signoz-values-$ENVIRONMENT.yaml" - + if [[ ! 
-f "$values_file" ]]; then echo "${RED}Error: Values file $values_file not found.${NC}" exit 1 fi - + echo "${BLUE}Deploying SigNoz to $ENVIRONMENT environment...${NC}" echo " Using values file: $values_file" echo " Target namespace: $NAMESPACE" - + echo " Chart version: Latest from signoz/signoz" + if [[ "$DRY_RUN" == true ]]; then echo " (dry-run) Would deploy SigNoz with:" - echo " helm install signoz signoz/signoz -n $NAMESPACE -f $values_file" + echo " helm upgrade --install signoz signoz/signoz -n $NAMESPACE -f $values_file --wait --timeout 15m" return fi - + # Use upgrade --install to handle both new installations and upgrades echo "${BLUE}Installing/Upgrading SigNoz...${NC}" - helm upgrade --install signoz signoz/signoz -n "$NAMESPACE" -f "$values_file" - - echo "${GREEN}SigNoz deployment initiated.${NC}" - echo "Waiting for pods to become ready..." - - # Wait for deployment to complete - wait_for_deployment + echo "This may take 10-15 minutes..." + + helm upgrade --install signoz signoz/signoz \ + -n "$NAMESPACE" \ + -f "$values_file" \ + --wait \ + --timeout 15m \ + --create-namespace + + echo "${GREEN}SigNoz deployment completed.${NC}" + echo "" + + # Show deployment status + show_deployment_status } # Function to remove SigNoz remove_signoz() { echo "${BLUE}Removing SigNoz deployment from namespace $NAMESPACE...${NC}" - + if [[ "$DRY_RUN" == true ]]; then echo " (dry-run) Would remove SigNoz deployment" return fi - + if helm list -n "$NAMESPACE" | grep -q signoz; then - helm uninstall signoz -n "$NAMESPACE" + helm uninstall signoz -n "$NAMESPACE" --wait echo "${GREEN}SigNoz deployment removed.${NC}" + + # Optionally remove PVCs (commented out by default for safety) + echo "" + echo "${YELLOW}Note: Persistent Volume Claims (PVCs) were NOT deleted.${NC}" + echo "To delete PVCs and all data, run:" + echo " kubectl delete pvc -n $NAMESPACE -l app.kubernetes.io/instance=signoz" else echo "${YELLOW}No SigNoz deployment found in namespace $NAMESPACE.${NC}" fi 
} -# Function to wait for deployment to complete -wait_for_deployment() { - echo "${BLUE}Waiting for SigNoz pods to become ready...${NC}" - - # Wait for pods to be ready - local timeout=600 # 10 minutes - local start_time=$(date +%s) - - while true; do - local current_time=$(date +%s) - local elapsed=$((current_time - start_time)) - - if [[ $elapsed -ge $timeout ]]; then - echo "${RED}Timeout waiting for SigNoz pods to become ready.${NC}" - break - fi - - # Check pod status - local ready_pods=$(kubectl get pods -n "$NAMESPACE" -l app.kubernetes.io/instance=signoz --field-selector=status.phase=Running 2>/dev/null | grep -c "Running" | tr -d '[:space:]' || echo "0") - local total_pods=$(kubectl get pods -n "$NAMESPACE" -l app.kubernetes.io/instance=signoz 2>/dev/null | grep -v "NAME" | wc -l | tr -d '[:space:]' || echo "0") - - if [[ $ready_pods -eq 0 ]]; then - echo " Waiting for pods to start..." - else - echo " $ready_pods/$total_pods pods are running" - - if [[ $ready_pods -eq $total_pods && $total_pods -gt 0 ]]; then - echo "${GREEN}All SigNoz pods are running!${NC}" - break - fi - fi - - sleep 10 - done - - # Show deployment status - show_deployment_status -} - # Function to show deployment status show_deployment_status() { echo "" @@ -322,30 +320,36 @@ show_deployment_status() { # Function to show access information show_access_info() { echo "${BLUE}=== Access Information ===${NC}" - + if [[ "$ENVIRONMENT" == "dev" ]]; then - echo "SigNoz UI: https://localhost/signoz" - echo "SigNoz API: https://localhost/signoz-api" + echo "SigNoz UI: http://monitoring.bakery-ia.local" echo "" - echo "OpenTelemetry Collector Endpoints:" - echo " gRPC: localhost:4317" - echo " HTTP: localhost:4318" - echo " Metrics: localhost:8888" + echo "OpenTelemetry Collector Endpoints (from within cluster):" + echo " gRPC: signoz-otel-collector.$NAMESPACE.svc.cluster.local:4317" + echo " HTTP: signoz-otel-collector.$NAMESPACE.svc.cluster.local:4318" + echo "" + echo "Port-forward for 
local access:" + echo " kubectl port-forward -n $NAMESPACE svc/signoz 8080:8080" + echo " kubectl port-forward -n $NAMESPACE svc/signoz-otel-collector 4317:4317" + echo " kubectl port-forward -n $NAMESPACE svc/signoz-otel-collector 4318:4318" else - echo "SigNoz UI: https://monitoring.bakewise.ai/signoz" - echo "SigNoz API: https://monitoring.bakewise.ai/signoz-api" - echo "SigNoz Alerts: https://monitoring.bakewise.ai/signoz-alerts" + echo "SigNoz UI: https://monitoring.bakewise.ai" echo "" - echo "OpenTelemetry Collector Endpoints:" - echo " gRPC: monitoring.bakewise.ai:4317" - echo " HTTP: monitoring.bakewise.ai:4318" + echo "OpenTelemetry Collector Endpoints (from within cluster):" + echo " gRPC: signoz-otel-collector.$NAMESPACE.svc.cluster.local:4317" + echo " HTTP: signoz-otel-collector.$NAMESPACE.svc.cluster.local:4318" + echo "" + echo "External endpoints (if exposed):" + echo " Check ingress configuration for external OTLP endpoints" fi - + echo "" echo "Default credentials:" - echo " Username: admin" + echo " Username: admin@example.com" echo " Password: admin" echo "" + echo "Note: Change default password after first login!" 
+ echo "" } # Main execution @@ -368,6 +372,9 @@ main() { exit 0 fi + # Setup Helm repository + setup_helm_repo + # Create Docker Hub secret for image pulls create_dockerhub_secret diff --git a/infrastructure/helm/signoz-values-dev.yaml b/infrastructure/helm/signoz-values-dev.yaml index 481c3ad5..b3ba28ed 100644 --- a/infrastructure/helm/signoz-values-dev.yaml +++ b/infrastructure/helm/signoz-values-dev.yaml @@ -1,11 +1,13 @@ # SigNoz Helm Chart Values - Development Environment # Optimized for local development with minimal resource usage +# DEPLOYED IN bakery-ia NAMESPACE - Ingress managed by bakery-ingress # # Official Chart: https://github.com/SigNoz/charts -# Install Command: helm install signoz signoz/signoz -n signoz --create-namespace -f signoz-values-dev.yaml +# Install Command: helm install signoz signoz/signoz -n bakery-ia -f signoz-values-dev.yaml global: storageClass: "standard" + clusterName: "bakery-ia-dev" domain: "monitoring.bakery-ia.local" # Docker Hub credentials - applied to all sub-charts (including Zookeeper, ClickHouse, etc) imagePullSecrets: @@ -23,17 +25,10 @@ signoz: type: ClusterIP port: 8080 + # DISABLE built-in ingress - using unified bakery-ingress instead + # Route configured in infrastructure/kubernetes/overlays/dev/dev-ingress.yaml ingress: - enabled: true - className: nginx - annotations: {} - hosts: - - host: monitoring.bakery-ia.local - paths: - - path: / - pathType: Prefix - port: 8080 - tls: [] + enabled: false resources: requests: @@ -43,6 +38,17 @@ signoz: cpu: 1000m memory: 1Gi + # Environment variables (new format - replaces configVars) + env: + signoz_telemetrystore_provider: "clickhouse" + dot_metrics_enabled: "true" + signoz_emailing_enabled: "false" + signoz_alertmanager_provider: "signoz" + # Retention for dev (7 days) + signoz_traces_ttl_duration_hrs: "168" + signoz_metrics_ttl_duration_hrs: "168" + signoz_logs_ttl_duration_hrs: "168" + persistence: enabled: true size: 5Gi @@ -92,6 +98,11 @@ clickhouse: enabled: true 
installCustomStorageClass: false + image: + registry: docker.io + repository: clickhouse/clickhouse-server + tag: 25.5.6 # Official recommended version + # Reduce ClickHouse resource requests for local dev clickhouse: resources: @@ -102,15 +113,39 @@ clickhouse: cpu: 1000m memory: 1Gi + persistence: + enabled: true + size: 20Gi + # Zookeeper Configuration (required by ClickHouse) zookeeper: enabled: true + replicaCount: 1 # Single replica for dev + + image: + tag: 3.7.1 # Official recommended version + + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi + + persistence: + enabled: true + size: 5Gi # OpenTelemetry Collector - Data ingestion endpoint for all telemetry otelCollector: enabled: true replicaCount: 1 + image: + repository: signoz/signoz-otel-collector + tag: v0.129.12 # Latest recommended version + # Service configuration - expose both gRPC and HTTP endpoints service: type: ClusterIP @@ -130,6 +165,11 @@ otelCollector: port: 8889 targetPort: 8889 protocol: TCP + # Metrics + - name: metrics + port: 8888 + targetPort: 8888 + protocol: TCP resources: requests: @@ -210,10 +250,11 @@ otelCollector: collection_interval: 60s processors: - # Batch processor for better performance + # Batch processor for better performance (optimized for high throughput) batch: - timeout: 10s - send_batch_size: 1024 + timeout: 1s + send_batch_size: 10000 # Increased from 1024 for better performance + send_batch_max_size: 10000 # Memory limiter to prevent OOM memory_limiter: @@ -223,35 +264,57 @@ otelCollector: # Resource detection resourcedetection: - detectors: [env, system] + detectors: [env, system, docker] timeout: 5s + # Span metrics processor for automatic service metrics + spanmetrics: + metrics_exporter: signozclickhousemetrics + latency_histogram_buckets: [2ms, 4ms, 6ms, 8ms, 10ms, 50ms, 100ms, 200ms, 400ms, 800ms, 1s, 1400ms, 2s, 5s, 10s, 15s] + dimensions_cache_size: 10000 + exporters: # ClickHouse exporter for traces 
clickhousetraces: datasource: tcp://signoz-clickhouse:9000/?database=signoz_traces timeout: 10s + retry_on_failure: + enabled: true + initial_interval: 5s + max_interval: 30s + max_elapsed_time: 300s # ClickHouse exporter for metrics signozclickhousemetrics: dsn: "tcp://admin:27ff0399-0d3a-4bd8-919d-17c2181e6fb9@signoz-clickhouse:9000/signoz_metrics" timeout: 10s + retry_on_failure: + enabled: true + initial_interval: 5s + max_interval: 30s + max_elapsed_time: 300s # ClickHouse exporter for logs clickhouselogsexporter: dsn: tcp://signoz-clickhouse:9000/?database=signoz_logs timeout: 10s + retry_on_failure: + enabled: true + initial_interval: 5s + max_interval: 30s # Debug exporter for debugging (optional) debug: verbosity: detailed + sampling_initial: 5 + sampling_thereafter: 200 service: pipelines: # Traces pipeline traces: receivers: [otlp] - processors: [memory_limiter, batch, resourcedetection] + processors: [memory_limiter, batch, spanmetrics, resourcedetection] exporters: [clickhousetraces] # Metrics pipeline diff --git a/infrastructure/helm/signoz-values-prod.yaml b/infrastructure/helm/signoz-values-prod.yaml index 9a932067..4f3f331a 100644 --- a/infrastructure/helm/signoz-values-prod.yaml +++ b/infrastructure/helm/signoz-values-prod.yaml @@ -1,11 +1,13 @@ # SigNoz Helm Chart Values - Production Environment # High-availability configuration with resource optimization +# DEPLOYED IN bakery-ia NAMESPACE - Ingress managed by bakery-ingress-prod # # Official Chart: https://github.com/SigNoz/charts -# Install Command: helm install signoz signoz/signoz -n signoz --create-namespace -f signoz-values-prod.yaml +# Install Command: helm install signoz signoz/signoz -n bakery-ia -f signoz-values-prod.yaml global: - storageClass: "standard" + storageClass: "standard" # For MicroK8s, use "microk8s-hostpath" or custom storage class + clusterName: "bakery-ia-prod" domain: "monitoring.bakewise.ai" # Docker Hub credentials - applied to all sub-charts (including Zookeeper, 
ClickHouse, etc) imagePullSecrets: @@ -15,43 +17,33 @@ global: imagePullSecrets: - dockerhub-creds -# Frontend Configuration -frontend: +# SigNoz Main Component (unified frontend + query service) +# BREAKING CHANGE: v0.89.0+ uses unified component instead of separate frontend/queryService +signoz: replicaCount: 2 + image: - repository: signoz/frontend - tag: 0.52.3 + repository: signoz/signoz + tag: v0.106.0 # Latest stable version pullPolicy: IfNotPresent service: type: ClusterIP - port: 3301 + port: 8080 # HTTP/API port + internalPort: 8085 # Internal gRPC port + # DISABLE built-in ingress - using unified bakery-ingress-prod instead + # Route configured in infrastructure/kubernetes/overlays/prod/prod-ingress.yaml ingress: - enabled: true - className: nginx - annotations: - nginx.ingress.kubernetes.io/rewrite-target: /$2 - nginx.ingress.kubernetes.io/use-regex: "true" - cert-manager.io/cluster-issuer: "letsencrypt-prod" - nginx.ingress.kubernetes.io/ssl-redirect: "true" - hosts: - - host: monitoring.bakewise.ai - paths: - - path: /signoz(/|$)(.*) - pathType: ImplementationSpecific - tls: - - secretName: signoz-tls - hosts: - - monitoring.bakewise.ai + enabled: false resources: requests: - cpu: 250m - memory: 512Mi - limits: cpu: 500m memory: 1Gi + limits: + cpu: 2000m + memory: 4Gi # Pod Anti-affinity for HA affinity: @@ -60,58 +52,27 @@ frontend: - weight: 100 podAffinityTerm: labelSelector: - matchExpressions: - - key: app - operator: In - values: - - signoz-frontend + matchLabels: + app.kubernetes.io/component: query-service topologyKey: kubernetes.io/hostname + # Environment variables (new format - replaces configVars) env: - - name: FRONTEND_REFRESH_INTERVAL - value: "30000" - -# Query Service Configuration -queryService: - replicaCount: 2 - image: - repository: signoz/query-service - tag: 0.52.3 - pullPolicy: IfNotPresent - - service: - type: ClusterIP - port: 8080 - - resources: - requests: - cpu: 500m - memory: 1Gi - limits: - cpu: 1000m - memory: 2Gi - - 
# Pod Anti-affinity for HA - affinity: - podAntiAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - weight: 100 - podAffinityTerm: - labelSelector: - matchExpressions: - - key: app - operator: In - values: - - signoz-query-service - topologyKey: kubernetes.io/hostname - - env: - - name: DEPLOYMENT_TYPE - value: "kubernetes-helm" - - name: SIGNOZ_LOCAL_DB_PATH - value: "/var/lib/signoz" - - name: RETENTION_DAYS - value: "30" + signoz_telemetrystore_provider: "clickhouse" + dot_metrics_enabled: "true" + signoz_emailing_enabled: "true" + signoz_alertmanager_provider: "signoz" + # Retention configuration (30 days for prod) + signoz_traces_ttl_duration_hrs: "720" + signoz_metrics_ttl_duration_hrs: "720" + signoz_logs_ttl_duration_hrs: "720" + # SMTP configuration for email alerts + signoz_smtp_enabled: "true" + signoz_smtp_host: "smtp.gmail.com" + signoz_smtp_port: "587" + signoz_smtp_from: "alerts@bakewise.ai" + signoz_smtp_username: "alerts@bakewise.ai" + # Password should be set via secret: signoz_smtp_password persistence: enabled: true @@ -128,7 +89,9 @@ queryService: # AlertManager Configuration alertmanager: + enabled: true replicaCount: 2 + image: repository: signoz/alertmanager tag: 0.23.5 @@ -140,11 +103,11 @@ alertmanager: resources: requests: - cpu: 250m - memory: 512Mi + cpu: 100m + memory: 128Mi limits: cpu: 500m - memory: 1Gi + memory: 512Mi # Pod Anti-affinity for HA affinity: @@ -210,24 +173,24 @@ alertmanager: # ClickHouse Configuration - Time Series Database clickhouse: - replicaCount: 2 + enabled: true + installCustomStorageClass: false + image: + registry: docker.io repository: clickhouse/clickhouse-server - tag: 24.1.2-alpine + tag: 25.5.6 # Updated to official recommended version pullPolicy: IfNotPresent - service: - type: ClusterIP - httpPort: 8123 - tcpPort: 9000 - - resources: - requests: - cpu: 1000m - memory: 2Gi - limits: - cpu: 2000m - memory: 4Gi + # ClickHouse resources (nested config) + clickhouse: + resources: + requests: + 
cpu: 1000m + memory: 2Gi + limits: + cpu: 4000m + memory: 8Gi # Pod Anti-affinity for HA affinity: @@ -246,50 +209,63 @@ clickhouse: size: 100Gi storageClass: "standard" - # ClickHouse configuration - config: - logger: - level: information - max_connections: 4096 - max_concurrent_queries: 500 - # Data retention (30 days for prod) - merge_tree: - parts_to_delay_insert: 150 - parts_to_throw_insert: 300 - # Performance tuning - max_memory_usage: 10000000000 - max_bytes_before_external_group_by: 20000000000 - - # Backup configuration - backup: + # Cold storage configuration for better disk space management + coldStorage: enabled: true - schedule: "0 2 * * *" - retention: 7 + defaultKeepFreeSpaceBytes: 10737418240 # Keep 10GB free + ttl: + deleteTTLDays: 30 # Move old data to cold storage after 30 days + +# Zookeeper Configuration (required by ClickHouse for coordination) +zookeeper: + enabled: true + replicaCount: 3 # CRITICAL: Always use 3 replicas for production HA + + image: + tag: 3.7.1 # Official recommended version + + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi + + persistence: + enabled: true + size: 10Gi + storageClass: "standard" # OpenTelemetry Collector - Integrated with SigNoz otelCollector: enabled: true replicaCount: 2 + image: repository: signoz/signoz-otel-collector - tag: 0.102.8 + tag: v0.129.12 # Updated to latest recommended version pullPolicy: IfNotPresent service: type: ClusterIP ports: - otlpGrpc: 4317 - otlpHttp: 4318 - metrics: 8888 - healthCheck: 13133 + - name: otlp-grpc + port: 4317 + - name: otlp-http + port: 4318 + - name: metrics + port: 8888 + - name: healthcheck + port: 13133 resources: requests: cpu: 500m memory: 512Mi limits: - cpu: 1000m - memory: 1Gi + cpu: 2000m + memory: 2Gi # Full OTEL Collector Configuration config: @@ -304,7 +280,7 @@ otelCollector: protocols: grpc: endpoint: 0.0.0.0:4317 - max_recv_msg_size_mib: 16 + max_recv_msg_size_mib: 32 # Increased for larger payloads http: 
endpoint: 0.0.0.0:4318 cors: @@ -322,19 +298,20 @@ otelCollector: - targets: ['localhost:8888'] processors: + # High-performance batch processing (official recommendation) batch: - timeout: 10s - send_batch_size: 2048 - send_batch_max_size: 4096 + timeout: 1s # Reduced from 10s for faster processing + send_batch_size: 50000 # Increased from 2048 (official recommendation for traces) + send_batch_max_size: 50000 memory_limiter: check_interval: 1s - limit_mib: 800 - spike_limit_mib: 200 + limit_mib: 1500 # 75% of container memory (2Gi = ~2048Mi) + spike_limit_mib: 300 # Resource detection for K8s resourcedetection: - detectors: [env, system, docker] + detectors: [env, system, docker, kubernetes] timeout: 5s # Add resource attributes @@ -347,6 +324,12 @@ otelCollector: value: bakery-ia-prod action: upsert + # Span metrics processor for automatic service performance metrics + spanmetrics: + metrics_exporter: signozclickhousemetrics + latency_histogram_buckets: [2ms, 4ms, 6ms, 8ms, 10ms, 50ms, 100ms, 200ms, 400ms, 800ms, 1s, 1400ms, 2s, 5s, 10s, 15s] + dimensions_cache_size: 100000 + exporters: # Export to SigNoz ClickHouse clickhousetraces: @@ -387,8 +370,8 @@ otelCollector: pipelines: traces: receivers: [otlp] - processors: [memory_limiter, batch, resourcedetection, resource] - exporters: [clickhousetraces, debug] + processors: [memory_limiter, batch, spanmetrics, resourcedetection, resource] + exporters: [clickhousetraces] metrics: receivers: [otlp, prometheus] @@ -398,12 +381,7 @@ otelCollector: logs: receivers: [otlp] processors: [memory_limiter, batch, resourcedetection, resource] - exporters: [clickhouselogsexporter, debug] - -# OpenTelemetry Collector Deployment Mode -otelCollectorDeployment: - enabled: true - mode: deployment + exporters: [clickhouselogsexporter] # HPA for OTEL Collector autoscaling: @@ -413,29 +391,18 @@ otelCollectorDeployment: targetCPUUtilizationPercentage: 70 targetMemoryUtilizationPercentage: 80 -# Node Exporter for infrastructure metrics 
-nodeExporter: +# Schema Migrator - Manages ClickHouse schema migrations +schemaMigrator: enabled: true - service: - type: ClusterIP - port: 9100 - resources: - requests: - cpu: 100m - memory: 128Mi - limits: - cpu: 200m - memory: 256Mi - -# Schemamanager - Manages ClickHouse schema -schemamanager: - enabled: true image: repository: signoz/signoz-schema-migrator - tag: 0.52.3 + tag: v0.129.12 # Updated to latest version pullPolicy: IfNotPresent + # Enable Helm hooks for proper upgrade handling + upgradeHelmHooks: true + # Additional Configuration serviceAccount: create: true diff --git a/infrastructure/helm/verify-signoz.sh b/infrastructure/helm/verify-signoz.sh index 8340d12c..da1197a7 100755 --- a/infrastructure/helm/verify-signoz.sh +++ b/infrastructure/helm/verify-signoz.sh @@ -26,7 +26,7 @@ show_help() { echo "" echo "Options: -h, --help Show this help message - -n, --namespace NAMESPACE Specify namespace (default: signoz)" + -n, --namespace NAMESPACE Specify namespace (default: bakery-ia)" echo "" echo "Examples: $0 dev # Verify development deployment @@ -35,7 +35,7 @@ show_help() { } # Parse command line arguments -NAMESPACE="signoz" +NAMESPACE="bakery-ia" while [[ $# -gt 0 ]]; do case $1 in @@ -224,20 +224,28 @@ verify_deployment() { # Function for development-specific verification verify_dev_specific() { echo "${BLUE}8. 
Development-specific checks...${NC}" - - # Check if localhost ingress is configured - if kubectl get ingress -n "$NAMESPACE" | grep -q "localhost"; then - echo "${GREEN}✅ Localhost ingress configured${NC}" + + # Check if ingress is configured + if kubectl get ingress -n "$NAMESPACE" 2>/dev/null | grep -q "monitoring.bakery-ia.local"; then + echo "${GREEN}✅ Development ingress configured${NC}" else - echo "${YELLOW}⚠️ Localhost ingress not found${NC}" + echo "${YELLOW}⚠️ Development ingress not found${NC}" fi - - # Check resource limits (should be lower for dev) - local query_service=$(kubectl get deployment -n "$NAMESPACE" signoz-query-service -o jsonpath='{.spec.template.spec.containers[0].resources.limits.memory}' 2>/dev/null || echo "") - if [[ -n "$query_service" && "$query_service" == "512Mi" ]]; then - echo "${GREEN}✅ Development resource limits applied${NC}" + + # Check unified signoz component resource limits (should be lower for dev) + local signoz_mem=$(kubectl get deployment -n "$NAMESPACE" -l app.kubernetes.io/component=query-service -o jsonpath='{.items[0].spec.template.spec.containers[0].resources.limits.memory}' 2>/dev/null || echo "") + if [[ -n "$signoz_mem" ]]; then + echo "${GREEN}✅ SigNoz component found (memory limit: $signoz_mem)${NC}" else - echo "${YELLOW}⚠️ Resource limits may not be optimized for development${NC}" + echo "${YELLOW}⚠️ Could not verify SigNoz component resources${NC}" + fi + + # Check single replica setup for dev + local replicas=$(kubectl get deployment -n "$NAMESPACE" -l app.kubernetes.io/component=query-service -o jsonpath='{.items[0].spec.replicas}' 2>/dev/null || echo "0") + if [[ $replicas -eq 1 ]]; then + echo "${GREEN}✅ Single replica configuration (appropriate for dev)${NC}" + else + echo "${YELLOW}⚠️ Multiple replicas detected (replicas: $replicas)${NC}" fi echo "" } @@ -245,28 +253,54 @@ verify_dev_specific() { # Function for production-specific verification verify_prod_specific() { echo "${BLUE}8. 
Production-specific checks...${NC}" - + # Check if TLS is configured - if kubectl get ingress -n "$NAMESPACE" | grep -q "signoz-tls-cert"; then + if kubectl get ingress -n "$NAMESPACE" 2>/dev/null | grep -q "signoz-tls"; then echo "${GREEN}✅ TLS certificate configured${NC}" else echo "${YELLOW}⚠️ TLS certificate not found${NC}" fi - - # Check if multiple replicas are running - local query_replicas=$(kubectl get deployment -n "$NAMESPACE" signoz-query-service -o jsonpath='{.spec.replicas}' 2>/dev/null || echo "1") - if [[ $query_replicas -gt 1 ]]; then - echo "${GREEN}✅ High availability configured ($query_replicas replicas)${NC}" + + # Check if multiple replicas are running for HA + local signoz_replicas=$(kubectl get deployment -n "$NAMESPACE" -l app.kubernetes.io/component=query-service -o jsonpath='{.items[0].spec.replicas}' 2>/dev/null || echo "1") + if [[ $signoz_replicas -gt 1 ]]; then + echo "${GREEN}✅ High availability configured ($signoz_replicas SigNoz replicas)${NC}" else - echo "${YELLOW}⚠️ Single replica detected (not highly available)${NC}" + echo "${YELLOW}⚠️ Single SigNoz replica detected (not highly available)${NC}" fi - - # Check resource limits (should be higher for prod) - local query_service=$(kubectl get deployment -n "$NAMESPACE" signoz-query-service -o jsonpath='{.spec.template.spec.containers[0].resources.limits.memory}' 2>/dev/null || echo "") - if [[ -n "$query_service" && "$query_service" == "2Gi" ]]; then - echo "${GREEN}✅ Production resource limits applied${NC}" + + # Check Zookeeper replicas (critical for production) + local zk_replicas=$(kubectl get statefulset -n "$NAMESPACE" -l app.kubernetes.io/component=zookeeper -o jsonpath='{.items[0].spec.replicas}' 2>/dev/null || echo "0") + if [[ $zk_replicas -eq 3 ]]; then + echo "${GREEN}✅ Zookeeper properly configured with 3 replicas${NC}" + elif [[ $zk_replicas -gt 0 ]]; then + echo "${YELLOW}⚠️ Zookeeper has $zk_replicas replicas (recommend 3 for production)${NC}" else - echo 
"${YELLOW}⚠️ Resource limits may not be optimized for production${NC}" + echo "${RED}❌ Zookeeper not found${NC}" + fi + + # Check OTel Collector replicas + local otel_replicas=$(kubectl get deployment -n "$NAMESPACE" -l app.kubernetes.io/component=otel-collector -o jsonpath='{.items[0].spec.replicas}' 2>/dev/null || echo "1") + if [[ $otel_replicas -gt 1 ]]; then + echo "${GREEN}✅ OTel Collector HA configured ($otel_replicas replicas)${NC}" + else + echo "${YELLOW}⚠️ Single OTel Collector replica${NC}" + fi + + # Check resource limits (should be higher for prod) + local signoz_mem=$(kubectl get deployment -n "$NAMESPACE" -l app.kubernetes.io/component=query-service -o jsonpath='{.items[0].spec.template.spec.containers[0].resources.limits.memory}' 2>/dev/null || echo "") + if [[ -n "$signoz_mem" ]]; then + echo "${GREEN}✅ Production resource limits applied (memory: $signoz_mem)${NC}" + else + echo "${YELLOW}⚠️ Could not verify resource limits${NC}" + fi + + # Check HPA (Horizontal Pod Autoscaler) + local hpa_count=$(kubectl get hpa -n "$NAMESPACE" 2>/dev/null | grep -c signoz || echo "0") + if [[ $hpa_count -gt 0 ]]; then + echo "${GREEN}✅ Horizontal Pod Autoscaler configured${NC}" + else + echo "${YELLOW}⚠️ No HPA found (consider enabling for production)${NC}" fi echo "" } @@ -278,39 +312,50 @@ show_access_info() { echo "📋 Access Information" echo "==========================================" echo "${NC}" - + if [[ "$ENVIRONMENT" == "dev" ]]; then - echo "SigNoz UI: https://localhost/signoz" - echo "SigNoz API: https://localhost/signoz-api" + echo "SigNoz UI: http://monitoring.bakery-ia.local" echo "" - echo "OpenTelemetry Collector:" - echo " gRPC: localhost:4317" - echo " HTTP: localhost:4318" - echo " Metrics: localhost:8888" + echo "OpenTelemetry Collector (within cluster):" + echo " gRPC: signoz-otel-collector.$NAMESPACE.svc.cluster.local:4317" + echo " HTTP: signoz-otel-collector.$NAMESPACE.svc.cluster.local:4318" + echo "" + echo "Port-forward for local 
access:" + echo " kubectl port-forward -n $NAMESPACE svc/signoz 8080:8080" + echo " kubectl port-forward -n $NAMESPACE svc/signoz-otel-collector 4317:4317" + echo " kubectl port-forward -n $NAMESPACE svc/signoz-otel-collector 4318:4318" else - echo "SigNoz UI: https://monitoring.bakewise.ai/signoz" - echo "SigNoz API: https://monitoring.bakewise.ai/signoz-api" - echo "SigNoz Alerts: https://monitoring.bakewise.ai/signoz-alerts" + echo "SigNoz UI: https://monitoring.bakewise.ai" echo "" - echo "OpenTelemetry Collector:" - echo " gRPC: monitoring.bakewise.ai:4317" - echo " HTTP: monitoring.bakewise.ai:4318" + echo "OpenTelemetry Collector (within cluster):" + echo " gRPC: signoz-otel-collector.$NAMESPACE.svc.cluster.local:4317" + echo " HTTP: signoz-otel-collector.$NAMESPACE.svc.cluster.local:4318" fi - + echo "" echo "Default Credentials:" - echo " Username: admin" + echo " Username: admin@example.com" echo " Password: admin" echo "" - + echo "⚠️ IMPORTANT: Change default password after first login!" 
+ echo "" + # Show connection test commands echo "Connection Test Commands:" if [[ "$ENVIRONMENT" == "dev" ]]; then - echo " curl -k https://localhost/signoz" - echo " curl -k https://localhost/signoz-api/health" + echo " # Test SigNoz UI" + echo " curl http://monitoring.bakery-ia.local" + echo "" + echo " # Test via port-forward" + echo " kubectl port-forward -n $NAMESPACE svc/signoz 8080:8080" + echo " curl http://localhost:8080" else - echo " curl https://monitoring.bakewise.ai/signoz" - echo " curl https://monitoring.bakewise.ai/signoz-api/health" + echo " # Test SigNoz UI" + echo " curl https://monitoring.bakewise.ai" + echo "" + echo " # Test API health" + echo " kubectl port-forward -n $NAMESPACE svc/signoz 8080:8080" + echo " curl http://localhost:8080/api/v1/health" fi echo "" } @@ -322,36 +367,43 @@ run_connectivity_tests() { echo "🔗 Running Connectivity Tests" echo "==========================================" echo "${NC}" - - if [[ "$ENVIRONMENT" == "dev" ]]; then - # Test frontend - echo "Testing SigNoz frontend..." - if curl -k -s -o /dev/null -w "%{http_code}" https://localhost/signoz | grep -q "200\|302"; then - echo "${GREEN}✅ Frontend accessible${NC}" - else - echo "${RED}❌ Frontend not accessible${NC}" - fi - - # Test API - echo "Testing SigNoz API..." - if curl -k -s -o /dev/null -w "%{http_code}" https://localhost/signoz-api/health | grep -q "200"; then - echo "${GREEN}✅ API accessible${NC}" - else - echo "${RED}❌ API not accessible${NC}" - fi - - # Test OTEL collector - echo "Testing OpenTelemetry collector..." - if curl -s -o /dev/null -w "%{http_code}" http://localhost:8888/metrics | grep -q "200"; then - echo "${GREEN}✅ OTEL collector accessible${NC}" - else - echo "${YELLOW}⚠️ OTEL collector not accessible (may not be exposed)${NC}" - fi + + # Test pod readiness first + echo "Checking pod readiness..." 
+ local ready_pods=$(kubectl get pods -n "$NAMESPACE" -l app.kubernetes.io/instance=signoz --field-selector=status.phase=Running 2>/dev/null | grep "Running" | grep -c "1/1\|2/2" || echo "0") + local total_pods=$(kubectl get pods -n "$NAMESPACE" -l app.kubernetes.io/instance=signoz 2>/dev/null | grep -v "NAME" | wc -l | tr -d ' ' || echo "0") + + if [[ $ready_pods -eq $total_pods && $total_pods -gt 0 ]]; then + echo "${GREEN}✅ All pods are ready ($ready_pods/$total_pods)${NC}" else - echo "${YELLOW}⚠️ Production connectivity tests require valid DNS and TLS${NC}" - echo " Please ensure monitoring.bakewise.ai resolves to your cluster" + echo "${YELLOW}⚠️ Some pods not ready ($ready_pods/$total_pods)${NC}" fi echo "" + + # Test internal service connectivity + echo "Testing internal service connectivity..." + local signoz_svc=$(kubectl get svc -n "$NAMESPACE" signoz -o jsonpath='{.spec.clusterIP}' 2>/dev/null || echo "") + if [[ -n "$signoz_svc" ]]; then + echo "${GREEN}✅ SigNoz service accessible at $signoz_svc:8080${NC}" + else + echo "${RED}❌ SigNoz service not found${NC}" + fi + + local otel_svc=$(kubectl get svc -n "$NAMESPACE" signoz-otel-collector -o jsonpath='{.spec.clusterIP}' 2>/dev/null || echo "") + if [[ -n "$otel_svc" ]]; then + echo "${GREEN}✅ OTel Collector service accessible at $otel_svc:4317 (gRPC), $otel_svc:4318 (HTTP)${NC}" + else + echo "${RED}❌ OTel Collector service not found${NC}" + fi + echo "" + + if [[ "$ENVIRONMENT" == "prod" ]]; then + echo "${YELLOW}⚠️ Production connectivity tests require valid DNS and TLS${NC}" + echo " Please ensure monitoring.bakewise.ai resolves to your cluster" + echo "" + echo "Manual test:" + echo " curl -I https://monitoring.bakewise.ai" + fi } # Main execution diff --git a/infrastructure/kubernetes/base/components/ai-insights/ai-insights-service.yaml b/infrastructure/kubernetes/base/components/ai-insights/ai-insights-service.yaml index 0a12744f..91a40801 100644 --- 
a/infrastructure/kubernetes/base/components/ai-insights/ai-insights-service.yaml +++ b/infrastructure/kubernetes/base/components/ai-insights/ai-insights-service.yaml @@ -97,9 +97,9 @@ spec: env: # OpenTelemetry Configuration - name: OTEL_COLLECTOR_ENDPOINT - value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" + value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318" - name: OTEL_EXPORTER_OTLP_ENDPOINT - value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" + value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318" - name: OTEL_SERVICE_NAME value: "ai-insights-service" - name: ENABLE_TRACING diff --git a/infrastructure/kubernetes/base/components/auth/auth-service.yaml b/infrastructure/kubernetes/base/components/auth/auth-service.yaml index b66aa0c0..d18d4559 100644 --- a/infrastructure/kubernetes/base/components/auth/auth-service.yaml +++ b/infrastructure/kubernetes/base/components/auth/auth-service.yaml @@ -98,9 +98,9 @@ spec: env: # OpenTelemetry Configuration - name: OTEL_COLLECTOR_ENDPOINT - value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" + value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318" - name: OTEL_EXPORTER_OTLP_ENDPOINT - value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" + value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318" - name: OTEL_SERVICE_NAME value: "auth-service" - name: ENABLE_TRACING diff --git a/infrastructure/kubernetes/base/components/distribution/distribution-service.yaml b/infrastructure/kubernetes/base/components/distribution/distribution-service.yaml index 78773ce8..2541a535 100644 --- a/infrastructure/kubernetes/base/components/distribution/distribution-service.yaml +++ b/infrastructure/kubernetes/base/components/distribution/distribution-service.yaml @@ -62,9 +62,9 @@ spec: value: "3" # OpenTelemetry Configuration - name: OTEL_COLLECTOR_ENDPOINT - value: 
"http://signoz-otel-collector.signoz.svc.cluster.local:4318" + value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318" - name: OTEL_EXPORTER_OTLP_ENDPOINT - value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" + value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318" - name: OTEL_SERVICE_NAME value: "distribution-service" - name: ENABLE_TRACING diff --git a/infrastructure/kubernetes/base/components/external/external-service.yaml b/infrastructure/kubernetes/base/components/external/external-service.yaml index ca64c606..24b03019 100644 --- a/infrastructure/kubernetes/base/components/external/external-service.yaml +++ b/infrastructure/kubernetes/base/components/external/external-service.yaml @@ -90,9 +90,9 @@ spec: env: # OpenTelemetry Configuration - name: OTEL_COLLECTOR_ENDPOINT - value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" + value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318" - name: OTEL_EXPORTER_OTLP_ENDPOINT - value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" + value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318" - name: OTEL_SERVICE_NAME value: "external-service" - name: ENABLE_TRACING diff --git a/infrastructure/kubernetes/base/components/forecasting/forecasting-service.yaml b/infrastructure/kubernetes/base/components/forecasting/forecasting-service.yaml index e118b48b..a318d23a 100644 --- a/infrastructure/kubernetes/base/components/forecasting/forecasting-service.yaml +++ b/infrastructure/kubernetes/base/components/forecasting/forecasting-service.yaml @@ -97,9 +97,9 @@ spec: env: # OpenTelemetry Configuration - name: OTEL_COLLECTOR_ENDPOINT - value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" + value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318" - name: OTEL_EXPORTER_OTLP_ENDPOINT - value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" + value: 
"http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318" - name: OTEL_SERVICE_NAME value: "forecasting-service" - name: ENABLE_TRACING diff --git a/infrastructure/kubernetes/base/components/infrastructure/gateway-service.yaml b/infrastructure/kubernetes/base/components/infrastructure/gateway-service.yaml index a02b3d5f..5f26a98a 100644 --- a/infrastructure/kubernetes/base/components/infrastructure/gateway-service.yaml +++ b/infrastructure/kubernetes/base/components/infrastructure/gateway-service.yaml @@ -52,7 +52,7 @@ spec: name: whatsapp-secrets env: - name: OTEL_EXPORTER_OTLP_ENDPOINT - value: "http://signoz-otel-collector.signoz.svc.cluster.local:4317" + value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4317" resources: requests: memory: "256Mi" diff --git a/infrastructure/kubernetes/base/components/inventory/inventory-service.yaml b/infrastructure/kubernetes/base/components/inventory/inventory-service.yaml index 37fe58d6..ef0c53fc 100644 --- a/infrastructure/kubernetes/base/components/inventory/inventory-service.yaml +++ b/infrastructure/kubernetes/base/components/inventory/inventory-service.yaml @@ -97,9 +97,9 @@ spec: env: # OpenTelemetry Configuration - name: OTEL_COLLECTOR_ENDPOINT - value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" + value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318" - name: OTEL_EXPORTER_OTLP_ENDPOINT - value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" + value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318" - name: OTEL_SERVICE_NAME value: "inventory-service" - name: ENABLE_TRACING diff --git a/infrastructure/kubernetes/base/components/notification/notification-service.yaml b/infrastructure/kubernetes/base/components/notification/notification-service.yaml index 22873832..a21cd549 100644 --- a/infrastructure/kubernetes/base/components/notification/notification-service.yaml +++ 
b/infrastructure/kubernetes/base/components/notification/notification-service.yaml @@ -97,9 +97,9 @@ spec: env: # OpenTelemetry Configuration - name: OTEL_COLLECTOR_ENDPOINT - value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" + value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318" - name: OTEL_EXPORTER_OTLP_ENDPOINT - value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" + value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318" - name: OTEL_SERVICE_NAME value: "notification-service" - name: ENABLE_TRACING diff --git a/infrastructure/kubernetes/base/components/orchestrator/orchestrator-service.yaml b/infrastructure/kubernetes/base/components/orchestrator/orchestrator-service.yaml index 0b2f53f5..c48a7ee3 100644 --- a/infrastructure/kubernetes/base/components/orchestrator/orchestrator-service.yaml +++ b/infrastructure/kubernetes/base/components/orchestrator/orchestrator-service.yaml @@ -97,9 +97,9 @@ spec: env: # OpenTelemetry Configuration - name: OTEL_COLLECTOR_ENDPOINT - value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" + value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318" - name: OTEL_EXPORTER_OTLP_ENDPOINT - value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" + value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318" - name: OTEL_SERVICE_NAME value: "orchestrator-service" - name: ENABLE_TRACING diff --git a/infrastructure/kubernetes/base/components/orders/orders-service.yaml b/infrastructure/kubernetes/base/components/orders/orders-service.yaml index 284a22d4..2ec3b955 100644 --- a/infrastructure/kubernetes/base/components/orders/orders-service.yaml +++ b/infrastructure/kubernetes/base/components/orders/orders-service.yaml @@ -97,9 +97,9 @@ spec: env: # OpenTelemetry Configuration - name: OTEL_COLLECTOR_ENDPOINT - value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" + value: 
"http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318" - name: OTEL_EXPORTER_OTLP_ENDPOINT - value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" + value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318" - name: OTEL_SERVICE_NAME value: "orders-service" - name: ENABLE_TRACING diff --git a/infrastructure/kubernetes/base/components/pos/pos-service.yaml b/infrastructure/kubernetes/base/components/pos/pos-service.yaml index 6e3496b0..771d4a96 100644 --- a/infrastructure/kubernetes/base/components/pos/pos-service.yaml +++ b/infrastructure/kubernetes/base/components/pos/pos-service.yaml @@ -97,9 +97,9 @@ spec: env: # OpenTelemetry Configuration - name: OTEL_COLLECTOR_ENDPOINT - value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" + value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318" - name: OTEL_EXPORTER_OTLP_ENDPOINT - value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" + value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318" - name: OTEL_SERVICE_NAME value: "pos-service" - name: ENABLE_TRACING diff --git a/infrastructure/kubernetes/base/components/procurement/procurement-service.yaml b/infrastructure/kubernetes/base/components/procurement/procurement-service.yaml index 4b766871..283c2858 100644 --- a/infrastructure/kubernetes/base/components/procurement/procurement-service.yaml +++ b/infrastructure/kubernetes/base/components/procurement/procurement-service.yaml @@ -97,9 +97,9 @@ spec: env: # OpenTelemetry Configuration - name: OTEL_COLLECTOR_ENDPOINT - value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" + value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318" - name: OTEL_EXPORTER_OTLP_ENDPOINT - value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" + value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318" - name: OTEL_SERVICE_NAME value: "procurement-service" - name: ENABLE_TRACING diff --git 
a/infrastructure/kubernetes/base/components/production/production-service.yaml b/infrastructure/kubernetes/base/components/production/production-service.yaml index 6515d35a..5cedcb91 100644 --- a/infrastructure/kubernetes/base/components/production/production-service.yaml +++ b/infrastructure/kubernetes/base/components/production/production-service.yaml @@ -97,9 +97,9 @@ spec: env: # OpenTelemetry Configuration - name: OTEL_COLLECTOR_ENDPOINT - value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" + value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318" - name: OTEL_EXPORTER_OTLP_ENDPOINT - value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" + value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318" - name: OTEL_SERVICE_NAME value: "production-service" - name: ENABLE_TRACING diff --git a/infrastructure/kubernetes/base/components/recipes/recipes-service.yaml b/infrastructure/kubernetes/base/components/recipes/recipes-service.yaml index 64aed0c4..5bd25974 100644 --- a/infrastructure/kubernetes/base/components/recipes/recipes-service.yaml +++ b/infrastructure/kubernetes/base/components/recipes/recipes-service.yaml @@ -97,9 +97,9 @@ spec: env: # OpenTelemetry Configuration - name: OTEL_COLLECTOR_ENDPOINT - value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" + value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318" - name: OTEL_EXPORTER_OTLP_ENDPOINT - value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" + value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318" - name: OTEL_SERVICE_NAME value: "recipes-service" - name: ENABLE_TRACING diff --git a/infrastructure/kubernetes/base/components/sales/sales-service.yaml b/infrastructure/kubernetes/base/components/sales/sales-service.yaml index 33390c3e..4b93aaf4 100644 --- a/infrastructure/kubernetes/base/components/sales/sales-service.yaml +++ 
b/infrastructure/kubernetes/base/components/sales/sales-service.yaml @@ -97,9 +97,9 @@ spec: env: # OpenTelemetry Configuration - name: OTEL_COLLECTOR_ENDPOINT - value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" + value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318" - name: OTEL_EXPORTER_OTLP_ENDPOINT - value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" + value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318" - name: OTEL_SERVICE_NAME value: "sales-service" - name: ENABLE_TRACING diff --git a/infrastructure/kubernetes/base/components/suppliers/suppliers-service.yaml b/infrastructure/kubernetes/base/components/suppliers/suppliers-service.yaml index edab7b66..b8e8c651 100644 --- a/infrastructure/kubernetes/base/components/suppliers/suppliers-service.yaml +++ b/infrastructure/kubernetes/base/components/suppliers/suppliers-service.yaml @@ -97,9 +97,9 @@ spec: env: # OpenTelemetry Configuration - name: OTEL_COLLECTOR_ENDPOINT - value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" + value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318" - name: OTEL_EXPORTER_OTLP_ENDPOINT - value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" + value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318" - name: OTEL_SERVICE_NAME value: "suppliers-service" - name: ENABLE_TRACING diff --git a/infrastructure/kubernetes/base/components/tenant/tenant-service.yaml b/infrastructure/kubernetes/base/components/tenant/tenant-service.yaml index bad816c8..919fd2a2 100644 --- a/infrastructure/kubernetes/base/components/tenant/tenant-service.yaml +++ b/infrastructure/kubernetes/base/components/tenant/tenant-service.yaml @@ -97,9 +97,9 @@ spec: env: # OpenTelemetry Configuration - name: OTEL_COLLECTOR_ENDPOINT - value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" + value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318" - name: 
OTEL_EXPORTER_OTLP_ENDPOINT - value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" + value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318" - name: OTEL_SERVICE_NAME value: "tenant-service" - name: ENABLE_TRACING diff --git a/infrastructure/kubernetes/base/components/training/training-service.yaml b/infrastructure/kubernetes/base/components/training/training-service.yaml index 4504e0ae..620869e0 100644 --- a/infrastructure/kubernetes/base/components/training/training-service.yaml +++ b/infrastructure/kubernetes/base/components/training/training-service.yaml @@ -97,9 +97,9 @@ spec: env: # OpenTelemetry Configuration - name: OTEL_COLLECTOR_ENDPOINT - value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" + value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318" - name: OTEL_EXPORTER_OTLP_ENDPOINT - value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" + value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318" - name: OTEL_SERVICE_NAME value: "training-service" - name: ENABLE_TRACING diff --git a/infrastructure/kubernetes/base/configmap.yaml b/infrastructure/kubernetes/base/configmap.yaml index c973200b..e7d0f767 100644 --- a/infrastructure/kubernetes/base/configmap.yaml +++ b/infrastructure/kubernetes/base/configmap.yaml @@ -385,13 +385,13 @@ data: # OBSERVABILITY - SigNoz (Unified Monitoring) # ================================================================ # OpenTelemetry Configuration - Direct to SigNoz - OTEL_EXPORTER_OTLP_ENDPOINT: "http://signoz-otel-collector.signoz.svc.cluster.local:4317" + OTEL_EXPORTER_OTLP_ENDPOINT: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4317" OTEL_EXPORTER_OTLP_PROTOCOL: "grpc" OTEL_SERVICE_NAME: "bakery-ia" OTEL_RESOURCE_ATTRIBUTES: "deployment.environment=development" # SigNoz Endpoints (v0.106.0+ unified service) - SIGNOZ_ENDPOINT: "http://signoz.signoz.svc.cluster.local:8080" + SIGNOZ_ENDPOINT: 
"http://signoz.bakery-ia.svc.cluster.local:8080" SIGNOZ_FRONTEND_URL: "https://monitoring.bakery-ia.local" # ================================================================ diff --git a/infrastructure/kubernetes/overlays/dev/dev-ingress.yaml b/infrastructure/kubernetes/overlays/dev/dev-ingress.yaml index ed9394f4..43059933 100644 --- a/infrastructure/kubernetes/overlays/dev/dev-ingress.yaml +++ b/infrastructure/kubernetes/overlays/dev/dev-ingress.yaml @@ -73,7 +73,14 @@ spec: name: gateway-service port: number: 8000 - # Note: SigNoz monitoring is deployed via Helm in the 'signoz' namespace - # SigNoz creates its own Ingress via Helm chart configuration (signoz-values-dev.yaml) - # Access at: https://monitoring.bakery-ia.local/ - # SignOz is served at the root of the monitoring subdomain \ No newline at end of file + # SigNoz Monitoring on subdomain (deployed via Helm in bakery-ia namespace) + - host: monitoring.bakery-ia.local + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: signoz + port: + number: 8080 \ No newline at end of file diff --git a/infrastructure/kubernetes/overlays/prod/kustomization.yaml b/infrastructure/kubernetes/overlays/prod/kustomization.yaml index 7e9a20e4..6acc7a0c 100644 --- a/infrastructure/kubernetes/overlays/prod/kustomization.yaml +++ b/infrastructure/kubernetes/overlays/prod/kustomization.yaml @@ -61,7 +61,7 @@ patches: value: "true" - op: add path: /data/OTEL_EXPORTER_OTLP_ENDPOINT - value: "http://signoz-otel-collector.signoz.svc.cluster.local:4317" + value: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4317" - op: add path: /data/OTEL_EXPORTER_OTLP_PROTOCOL value: "grpc" diff --git a/infrastructure/kubernetes/overlays/prod/prod-configmap.yaml b/infrastructure/kubernetes/overlays/prod/prod-configmap.yaml index 0d70c1c0..b253bfcd 100644 --- a/infrastructure/kubernetes/overlays/prod/prod-configmap.yaml +++ b/infrastructure/kubernetes/overlays/prod/prod-configmap.yaml @@ -23,13 +23,13 @@ data: 
ENABLE_LOGS: "true" # OpenTelemetry Configuration - Direct to SigNoz - OTEL_EXPORTER_OTLP_ENDPOINT: "http://signoz-otel-collector.signoz.svc.cluster.local:4317" + OTEL_EXPORTER_OTLP_ENDPOINT: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4317" OTEL_EXPORTER_OTLP_PROTOCOL: "grpc" OTEL_SERVICE_NAME: "bakery-ia" OTEL_RESOURCE_ATTRIBUTES: "deployment.environment=production,cluster.name=bakery-ia-prod" # SigNoz Endpoints (v0.106.0+ unified service) - SIGNOZ_ENDPOINT: "http://signoz.signoz.svc.cluster.local:8080" + SIGNOZ_ENDPOINT: "http://signoz.bakery-ia.svc.cluster.local:8080" SIGNOZ_FRONTEND_URL: "https://monitoring.bakewise.ai" SIGNOZ_ROOT_URL: "https://monitoring.bakewise.ai" diff --git a/infrastructure/kubernetes/overlays/prod/prod-ingress.yaml b/infrastructure/kubernetes/overlays/prod/prod-ingress.yaml index aced44c8..378beca7 100644 --- a/infrastructure/kubernetes/overlays/prod/prod-ingress.yaml +++ b/infrastructure/kubernetes/overlays/prod/prod-ingress.yaml @@ -41,6 +41,7 @@ spec: tls: - hosts: - bakewise.ai + - monitoring.bakewise.ai secretName: bakery-ia-prod-tls-cert rules: - host: bakewise.ai @@ -60,6 +61,14 @@ spec: name: gateway-service port: number: 8000 - # Note: SigNoz monitoring is deployed via Helm in the 'signoz' namespace - # SigNoz creates its own Ingress via Helm chart configuration - # Access at: https://monitoring.bakewise.ai (configured in signoz-values-prod.yaml) + # SigNoz Monitoring on subdomain (deployed via Helm in bakery-ia namespace) + - host: monitoring.bakewise.ai + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: signoz + port: + number: 8080 diff --git a/services/ai_insights/app/main.py b/services/ai_insights/app/main.py index 32f7b06b..80337206 100644 --- a/services/ai_insights/app/main.py +++ b/services/ai_insights/app/main.py @@ -30,7 +30,7 @@ def setup_tracing(service_name: str = "ai-insights"): resource = Resource.create({"service.name": service_name}) otlp_exporter = OTLPSpanExporter( - 
endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://otel-collector.monitoring.svc.cluster.local:4317"), + endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4317"), insecure=True ) diff --git a/services/alert_processor/app/main.py b/services/alert_processor/app/main.py index a22b25b8..a40ccd1c 100644 --- a/services/alert_processor/app/main.py +++ b/services/alert_processor/app/main.py @@ -35,7 +35,7 @@ def setup_tracing(service_name: str = "alert-processor"): resource = Resource.create({"service.name": service_name}) otlp_exporter = OTLPSpanExporter( - endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://otel-collector.monitoring.svc.cluster.local:4317"), + endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4317"), insecure=True ) diff --git a/services/demo_session/app/main.py b/services/demo_session/app/main.py index c3e61f5e..504cbd2b 100644 --- a/services/demo_session/app/main.py +++ b/services/demo_session/app/main.py @@ -33,7 +33,7 @@ def setup_tracing(service_name: str = "demo-session"): resource = Resource.create({"service.name": service_name}) otlp_exporter = OTLPSpanExporter( - endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://otel-collector.monitoring.svc.cluster.local:4317"), + endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4317"), insecure=True ) diff --git a/shared/monitoring/logs_exporter.py b/shared/monitoring/logs_exporter.py index 7c9ef91d..0dde34c3 100644 --- a/shared/monitoring/logs_exporter.py +++ b/shared/monitoring/logs_exporter.py @@ -68,7 +68,7 @@ def setup_otel_logging( if otel_endpoint is None: otel_endpoint = os.getenv( "OTEL_EXPORTER_OTLP_ENDPOINT", - os.getenv("OTEL_COLLECTOR_ENDPOINT", "http://signoz-otel-collector.signoz:4318") + os.getenv("OTEL_COLLECTOR_ENDPOINT", "http://signoz-otel-collector.bakery-ia:4318") ) # Ensure endpoint has /v1/logs 
path for HTTP diff --git a/shared/monitoring/metrics_exporter.py b/shared/monitoring/metrics_exporter.py index 3f35a30d..6a4020eb 100644 --- a/shared/monitoring/metrics_exporter.py +++ b/shared/monitoring/metrics_exporter.py @@ -69,7 +69,7 @@ def setup_otel_metrics( if otel_endpoint is None: otel_endpoint = os.getenv( "OTEL_EXPORTER_OTLP_ENDPOINT", - os.getenv("OTEL_COLLECTOR_ENDPOINT", "http://signoz-otel-collector.signoz:4318") + os.getenv("OTEL_COLLECTOR_ENDPOINT", "http://signoz-otel-collector.bakery-ia:4318") ) # Ensure endpoint has /v1/metrics path for HTTP diff --git a/shared/monitoring/tracing.py b/shared/monitoring/tracing.py index 79222d5c..baf76bef 100755 --- a/shared/monitoring/tracing.py +++ b/shared/monitoring/tracing.py @@ -22,7 +22,7 @@ def setup_tracing( app, service_name: str, service_version: str = "1.0.0", - otel_endpoint: str = "http://signoz-otel-collector.signoz:4318" + otel_endpoint: str = "http://signoz-otel-collector.bakery-ia:4318" ): """ Setup OpenTelemetry distributed tracing for a FastAPI service. diff --git a/shared/service_base.py b/shared/service_base.py index 5dc22ce6..3ecddbfe 100755 --- a/shared/service_base.py +++ b/shared/service_base.py @@ -151,7 +151,7 @@ class BaseFastAPIService: try: otel_endpoint = os.getenv( "OTEL_COLLECTOR_ENDPOINT", - "http://signoz-otel-collector.signoz:4318" + "http://signoz-otel-collector.bakery-ia:4318" ) setup_tracing(self.app, self.service_name, self.version, otel_endpoint) self.logger.info(f"Distributed tracing enabled for {self.service_name}")