# SigNoz Helm Chart Values - Production Environment
# High-availability configuration with resource optimization
# DEPLOYED IN bakery-ia NAMESPACE - Ingress managed by bakery-ingress-prod
#
# Official Chart: https://github.com/SigNoz/charts
# Install Command: helm install signoz signoz/signoz -n bakery-ia -f signoz-values-prod.yaml
#
# Sections: global / imagePullSecrets, signoz, alertmanager, clickhouse,
# zookeeper, otelCollector.

global:
  storageClass: "standard"  # For MicroK8s, use "microk8s-hostpath" or custom storage class
  clusterName: "bakery-ia-prod"
  domain: "monitoring.bakewise.ai"
  # Docker Hub credentials - applied to all sub-charts (including Zookeeper, ClickHouse, etc)
  imagePullSecrets:
    - dockerhub-creds

# Docker Hub credentials for pulling images (root level for SigNoz components)
imagePullSecrets:
  - dockerhub-creds

# SigNoz Main Component (unified frontend + query service)
# BREAKING CHANGE: v0.89.0+ uses unified component instead of separate frontend/queryService
signoz:
  replicaCount: 2

  image:
    repository: signoz/signoz
    tag: v0.106.0  # Latest stable version
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    port: 8080  # HTTP/API port
    internalPort: 8085  # Internal gRPC port

  # DISABLE built-in ingress - using unified bakery-ingress-prod instead
  # Route configured in infrastructure/kubernetes/overlays/prod/prod-ingress.yaml
  ingress:
    enabled: false

  resources:
    requests:
      cpu: 500m
      memory: 1Gi
    limits:
      cpu: 2000m
      memory: 4Gi

  # Pod Anti-affinity for HA
  # The selector targets the unified "signoz" component; the previous value
  # ("query-service") is the pre-v0.89.0 component name and would match no pods,
  # so replicas were never spread across nodes.
  # NOTE(review): confirm the label with `kubectl get pods --show-labels`.
  affinity:
    podAntiAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 100
          podAffinityTerm:
            labelSelector:
              matchLabels:
                app.kubernetes.io/component: signoz
            topologyKey: kubernetes.io/hostname

  # Environment variables (new format - replaces configVars)
  # All values are quoted so they reach the container as strings.
  env:
    signoz_telemetrystore_provider: "clickhouse"
    dot_metrics_enabled: "true"
    signoz_emailing_enabled: "true"
    signoz_alertmanager_provider: "signoz"
    # Retention configuration (30 days for prod; 720 h = 30 days)
    signoz_traces_ttl_duration_hrs: "720"
    signoz_metrics_ttl_duration_hrs: "720"
    signoz_logs_ttl_duration_hrs: "720"
    # OpAMP Server Configuration
    # WARNING: OpAMP can cause gRPC instability and collector reloads
    # Only enable if you have a stable OpAMP backend server
    signoz_opamp_server_enabled: "false"
    # signoz_opamp_server_endpoint: "0.0.0.0:4320"
    # SMTP configuration for email alerts
    signoz_smtp_enabled: "true"
    signoz_smtp_host: "smtp.gmail.com"
    signoz_smtp_port: "587"
    signoz_smtp_from: "alerts@bakewise.ai"
    signoz_smtp_username: "alerts@bakewise.ai"
    # Password should be set via secret: signoz_smtp_password

  persistence:
    enabled: true
    size: 20Gi
    storageClass: "standard"

  # Horizontal Pod Autoscaler
  autoscaling:
    enabled: true
    minReplicas: 2
    maxReplicas: 5
    targetCPUUtilizationPercentage: 70
    targetMemoryUtilizationPercentage: 80

# AlertManager Configuration
alertmanager:
  enabled: true
  replicaCount: 2

  image:
    repository: signoz/alertmanager
    tag: "0.23.5"
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    port: 9093

  resources:
    requests:
      cpu: 100m
      memory: 128Mi
    limits:
      cpu: 500m
      memory: 512Mi

  # Pod Anti-affinity for HA
  # NOTE(review): this only has an effect if the pods carry the label
  # app=signoz-alertmanager - confirm against the rendered manifests.
  affinity:
    podAntiAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 100
          podAffinityTerm:
            labelSelector:
              matchExpressions:
                - key: app
                  operator: In
                  values:
                    - signoz-alertmanager
            topologyKey: kubernetes.io/hostname

  persistence:
    enabled: true
    size: 5Gi
    storageClass: "standard"

  config:
    global:
      resolve_timeout: 5m
      smtp_smarthost: 'smtp.gmail.com:587'
      smtp_from: 'alerts@bakewise.ai'
      smtp_auth_username: 'alerts@bakewise.ai'
      # NOTE(review): '${SMTP_PASSWORD}' (and '${SLACK_WEBHOOK_URL}' below) are
      # literal strings in this file; confirm that something substitutes them
      # before the configuration is loaded.
      smtp_auth_password: '${SMTP_PASSWORD}'
      smtp_require_tls: true

    route:
      group_by: ['alertname', 'cluster', 'service', 'severity']
      group_wait: 10s
      group_interval: 10s
      repeat_interval: 12h
      # Default receiver for alerts that match none of the routes below
      receiver: 'critical-alerts'
      routes:
        - match:
            severity: critical
          receiver: 'critical-alerts'
          continue: true
        - match:
            severity: warning
          receiver: 'warning-alerts'

    receivers:
      - name: 'critical-alerts'
        email_configs:
          - to: 'critical-alerts@bakewise.ai'
            headers:
              Subject: '[CRITICAL] {{ .GroupLabels.alertname }} - Bakery IA'
        # Slack webhook for critical alerts
        slack_configs:
          - api_url: '${SLACK_WEBHOOK_URL}'
            channel: '#alerts-critical'
            title: '[CRITICAL] {{ .GroupLabels.alertname }}'
            text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'

      - name: 'warning-alerts'
        email_configs:
          - to: 'oncall@bakewise.ai'
            headers:
              Subject: '[WARNING] {{ .GroupLabels.alertname }} - Bakery IA'

# ClickHouse Configuration - Time Series Database
clickhouse:
  enabled: true
  installCustomStorageClass: false

  image:
    registry: docker.io
    repository: clickhouse/clickhouse-server
    tag: "25.5.6"  # Updated to official recommended version
    pullPolicy: IfNotPresent

  # ClickHouse resources (nested config)
  clickhouse:
    resources:
      requests:
        cpu: 1000m
        memory: 2Gi
      limits:
        cpu: 4000m
        memory: 8Gi

  # Pod Anti-affinity for HA
  # NOTE(review): this is a hard (required) rule keyed on app=signoz-clickhouse;
  # confirm the pods actually carry that label, otherwise the rule is a no-op.
  affinity:
    podAntiAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        - labelSelector:
            matchExpressions:
              - key: app
                operator: In
                values:
                  - signoz-clickhouse
          topologyKey: kubernetes.io/hostname

  persistence:
    enabled: true
    size: 100Gi
    storageClass: "standard"

  # Cold storage configuration for better disk space management
  # NOTE(review): cold storage is enabled here without any storage endpoint or
  # credentials in this file - confirm the chart defaults are what is intended.
  coldStorage:
    enabled: true
    # Quoted so the byte count is passed through as written instead of being
    # parsed as a number and possibly re-rendered in scientific notation.
    defaultKeepFreeSpaceBytes: "10737418240"  # Keep 10GB free
    ttl:
      # NOTE(review): the key name says "delete" while the earlier comment
      # described moving data to cold storage after 30 days; confirm which of
      # the two the chart actually does with this value.
      deleteTTLDays: 30

# Zookeeper Configuration (required by ClickHouse for coordination)
zookeeper:
  enabled: true
  replicaCount: 3  # CRITICAL: Always use 3 replicas for production HA

  image:
    tag: "3.7.1"  # Official recommended version

  resources:
    requests:
      cpu: 100m
      memory: 256Mi
    limits:
      cpu: 500m
      memory: 512Mi

  persistence:
    enabled: true
    size: 10Gi
    storageClass: "standard"

# OpenTelemetry Collector - Integrated with SigNoz
otelCollector:
  enabled: true
  replicaCount: 2

  image:
    repository: signoz/signoz-otel-collector
    tag: v0.129.12  # Updated to latest recommended version
    pullPolicy: IfNotPresent

  # Init containers for the Otel Collector pod
  initContainers:
    # Copies the PostgreSQL TLS files from the read-only Secret mount into an
    # emptyDir and sets the file modes on the copies.
    fix-postgres-tls:
      enabled: true
      image:
        registry: docker.io
        repository: busybox
        # Quoted: an unquoted 1.35 is parsed as a float, not a tag string.
        tag: "1.35"
        pullPolicy: IfNotPresent
      command:
        - sh
        - -c
        - |
          echo "Fixing PostgreSQL TLS file permissions..."
          cp /etc/postgres-tls-source/* /etc/postgres-tls/
          chmod 600 /etc/postgres-tls/server-key.pem
          chmod 644 /etc/postgres-tls/server-cert.pem
          chmod 644 /etc/postgres-tls/ca-cert.pem
          echo "PostgreSQL TLS permissions fixed"
      volumeMounts:
        # Must name a volume declared in extraVolumes below. The Secret volume
        # is called "postgres-tls"; no volume named "postgres-tls-source" exists.
        - name: postgres-tls
          mountPath: /etc/postgres-tls-source
          readOnly: true
        - name: postgres-tls-fixed
          mountPath: /etc/postgres-tls
          readOnly: false

  service:
    type: ClusterIP
    ports:
      - name: otlp-grpc
        port: 4317
        targetPort: 4317
        protocol: TCP
      - name: otlp-http
        port: 4318
        targetPort: 4318
        protocol: TCP
      - name: prometheus
        port: 8889
        targetPort: 8889
        protocol: TCP
      - name: metrics
        port: 8888
        targetPort: 8888
        protocol: TCP

  resources:
    requests:
      cpu: 500m
      memory: 512Mi
    limits:
      cpu: 2000m
      memory: 2Gi

  # Additional environment variables for receivers
  # SECURITY(review): the three *_PASSWORD values below are plaintext
  # credentials in a values file. Rotate them and supply them from a Kubernetes
  # Secret instead; they are left unchanged here only so that this edit does
  # not alter what is deployed.
  # Sketch (assumes the chart passes map values through as-is - TODO confirm;
  # the Secret name and key are placeholders):
  #   POSTGRES_MONITOR_PASSWORD:
  #     valueFrom:
  #       secretKeyRef:
  #         name: <secret-name>
  #         key: <key>
  additionalEnvs:
    POSTGRES_MONITOR_USER: "monitoring"
    POSTGRES_MONITOR_PASSWORD: "monitoring_369f9c001f242b07ef9e2826e17169ca"
    REDIS_PASSWORD: "OxdmdJjdVNXp37MNC2IFoMnTpfGGFv1k"
    RABBITMQ_USER: "bakery"
    RABBITMQ_PASSWORD: "forecast123"

  # Mount TLS certificates for secure connections
  extraVolumes:
    - name: redis-tls
      secret:
        secretName: redis-tls-secret
    - name: postgres-tls
      secret:
        secretName: postgres-tls
    # Writable scratch volume that receives the copied PostgreSQL TLS files
    - name: postgres-tls-fixed
      emptyDir: {}
    # Node log directory read by the filelog receiver
    - name: varlogpods
      hostPath:
        path: /var/log/pods

  extraVolumeMounts:
    - name: redis-tls
      mountPath: /etc/redis-tls
      readOnly: true
    - name: postgres-tls
      mountPath: /etc/postgres-tls-source
      readOnly: true
    - name: postgres-tls-fixed
      mountPath: /etc/postgres-tls
      readOnly: false
    - name: varlogpods
      mountPath: /var/log/pods
      readOnly: true

  # Collector start-up command; --manager-config points at the OpAMP config.
  # NOTE(review): signoz_opamp_server_enabled is "false" in the signoz section
  # of this file - confirm that passing --manager-config is still intended.
  command:
    name: /signoz-otel-collector
    extraArgs:
      - --config=/conf/otel-collector-config.yaml
      - --manager-config=/conf/otel-collector-opamp-config.yaml
      - --feature-gates=-pkg.translator.prometheus.NormalizeName

  # Full OTEL Collector Configuration
  config:
    # Connectors - bridge between pipelines
    connectors:
      signozmeter:
        dimensions:
          - name: service.name
          - name: deployment.environment
          - name: host.name
        metrics_flush_interval: 1h

    extensions:
      health_check:
        endpoint: 0.0.0.0:13133
      zpages:
        endpoint: 0.0.0.0:55679

    receivers:
      otlp:
        protocols:
          grpc:
            endpoint: 0.0.0.0:4317
            max_recv_msg_size_mib: 32  # Increased for larger payloads
          http:
            endpoint: 0.0.0.0:4318
            cors:
              allowed_origins:
                - "https://monitoring.bakewise.ai"
                - "https://*.bakewise.ai"

      # Filelog receiver for Kubernetes pod logs
      # Collects container stdout/stderr from /var/log/pods
      filelog:
        include:
          - /var/log/pods/*/*/*.log
        exclude:
          # Exclude SigNoz's own logs to avoid recursive collection
          - /var/log/pods/bakery-ia_signoz-*/*/*.log
        include_file_path: true
        include_file_name: false
        operators:
          # Parse CRI-O / containerd log format
          - type: regex_parser
            regex: '^(?P