# SigNoz Helm Chart Values - Production Environment
# High-availability configuration with resource optimization
#
# Official Chart: https://github.com/SigNoz/charts
# Install Command:
#   helm install signoz signoz/signoz -n signoz --create-namespace \
#     -f signoz-values-prod.yaml
---
global:
  storageClass: "standard"
  domain: "monitoring.bakewise.ai"
  # Docker Hub credentials - applied to all sub-charts
  # (including Zookeeper, ClickHouse, etc)
  imagePullSecrets:
    - dockerhub-creds

# Docker Hub credentials for pulling images
# (root level for SigNoz components)
imagePullSecrets:
  - dockerhub-creds

# Frontend Configuration
frontend:
  replicaCount: 2

  image:
    repository: signoz/frontend
    # Quoted so the version is always read as a string.
    tag: "0.52.3"
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    port: 3301

  ingress:
    enabled: true
    className: nginx
    annotations:
      # "$2" is the second capture group of the regex path below, i.e.
      # everything after the /signoz prefix.
      nginx.ingress.kubernetes.io/rewrite-target: /$2
      nginx.ingress.kubernetes.io/use-regex: "true"
      cert-manager.io/cluster-issuer: "letsencrypt-prod"
      nginx.ingress.kubernetes.io/ssl-redirect: "true"
    hosts:
      - host: monitoring.bakewise.ai
        paths:
          - path: /signoz(/|$)(.*)
            pathType: ImplementationSpecific
    tls:
      - secretName: signoz-tls
        hosts:
          - monitoring.bakewise.ai

  resources:
    requests:
      cpu: 250m
      memory: 512Mi
    limits:
      cpu: 500m
      memory: 1Gi

  # Pod Anti-affinity for HA
  # Soft rule: prefer spreading replicas across nodes, but still schedule
  # when that is not possible.
  # NOTE(review): the selector assumes pods carry the label
  # app=signoz-frontend - verify against the labels the chart renders.
  # The same applies to every other affinity block in this file.
  affinity:
    podAntiAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 100
          podAffinityTerm:
            labelSelector:
              matchExpressions:
                - key: app
                  operator: In
                  values:
                    - signoz-frontend
            topologyKey: kubernetes.io/hostname

  env:
    - name: FRONTEND_REFRESH_INTERVAL
      value: "30000"

# Query Service Configuration
# NOTE(review): this section runs 2-5 replicas while also pointing
# SIGNOZ_LOCAL_DB_PATH at a persisted local directory - confirm the query
# service tolerates more than one replica with that storage layout.
queryService:
  replicaCount: 2

  image:
    repository: signoz/query-service
    tag: "0.52.3"
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    port: 8080

  resources:
    requests:
      cpu: 500m
      memory: 1Gi
    limits:
      cpu: 1000m
      memory: 2Gi

  # Pod Anti-affinity for HA
  affinity:
    podAntiAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 100
          podAffinityTerm:
            labelSelector:
              matchExpressions:
                - key: app
                  operator: In
                  values:
                    - signoz-query-service
            topologyKey: kubernetes.io/hostname

  # Environment values are quoted: container env vars must be strings.
  env:
    - name: DEPLOYMENT_TYPE
      value: "kubernetes-helm"
    - name: SIGNOZ_LOCAL_DB_PATH
      value: "/var/lib/signoz"
    - name: RETENTION_DAYS
      value: "30"

  persistence:
    enabled: true
    size: 20Gi
    storageClass: "standard"

  # Horizontal Pod Autoscaler
  autoscaling:
    enabled: true
    minReplicas: 2
    maxReplicas: 5
    targetCPUUtilizationPercentage: 70
    targetMemoryUtilizationPercentage: 80

# AlertManager Configuration
alertmanager:
  replicaCount: 2

  image:
    repository: signoz/alertmanager
    tag: "0.23.5"
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    port: 9093

  resources:
    requests:
      cpu: 250m
      memory: 512Mi
    limits:
      cpu: 500m
      memory: 1Gi

  # Pod Anti-affinity for HA
  affinity:
    podAntiAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 100
          podAffinityTerm:
            labelSelector:
              matchExpressions:
                - key: app
                  operator: In
                  values:
                    - signoz-alertmanager
            topologyKey: kubernetes.io/hostname

  persistence:
    enabled: true
    size: 5Gi
    storageClass: "standard"

  # Alertmanager configuration (SMTP defaults, routing tree, receivers)
  config:
    global:
      resolve_timeout: 5m
      smtp_smarthost: 'smtp.gmail.com:587'
      smtp_from: 'alerts@bakewise.ai'
      smtp_auth_username: 'alerts@bakewise.ai'
      # NOTE(review): '${SMTP_PASSWORD}' is a literal string as far as YAML
      # and Helm are concerned. Confirm something substitutes it before
      # Alertmanager reads the config; otherwise SMTP auth is attempted
      # with the placeholder text itself.
      smtp_auth_password: '${SMTP_PASSWORD}'
      smtp_require_tls: true

    route:
      group_by: ['alertname', 'cluster', 'service', 'severity']
      group_wait: 10s
      group_interval: 10s
      repeat_interval: 12h
      # Fallback receiver for alerts that match none of the routes below.
      receiver: 'critical-alerts'
      routes:
        - match:
            severity: critical
          receiver: 'critical-alerts'
          continue: true
        - match:
            severity: warning
          receiver: 'warning-alerts'

    receivers:
      - name: 'critical-alerts'
        email_configs:
          - to: 'critical-alerts@bakewise.ai'
            headers:
              Subject: '[CRITICAL] {{ .GroupLabels.alertname }} - Bakery IA'
        # Slack webhook for critical alerts
        # NOTE(review): '${SLACK_WEBHOOK_URL}' is a literal placeholder too;
        # confirm it is substituted before the config is loaded.
        slack_configs:
          - api_url: '${SLACK_WEBHOOK_URL}'
            channel: '#alerts-critical'
            title: '[CRITICAL] {{ .GroupLabels.alertname }}'
            text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
      - name: 'warning-alerts'
        email_configs:
          - to: 'oncall@bakewise.ai'
            headers:
              Subject: '[WARNING] {{ .GroupLabels.alertname }} - Bakery IA'

# ClickHouse Configuration - Time Series Database
clickhouse:
  replicaCount: 2

  image:
    repository: clickhouse/clickhouse-server
    tag: "24.1.2-alpine"
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    httpPort: 8123
    tcpPort: 9000

  resources:
    requests:
      cpu: 1000m
      memory: 2Gi
    limits:
      cpu: 2000m
      memory: 4Gi

  # Pod Anti-affinity for HA
  # Hard rule (unlike the other components): two ClickHouse replicas are
  # never scheduled onto the same node.
  affinity:
    podAntiAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        - labelSelector:
            matchExpressions:
              - key: app
                operator: In
                values:
                  - signoz-clickhouse
          topologyKey: kubernetes.io/hostname

  persistence:
    enabled: true
    size: 100Gi
    storageClass: "standard"

  # ClickHouse configuration
  config:
    logger:
      level: information

    max_connections: 4096
    max_concurrent_queries: 500

    # Insert back-pressure: number of active parts in a partition at which
    # inserts are slowed down, and at which they are rejected. These are
    # not retention settings.
    merge_tree:
      parts_to_delay_insert: 150
      parts_to_throw_insert: 300

    # Performance tuning
    # Both values are in bytes. max_memory_usage stays below the 4Gi
    # container memory limit declared under resources above, and the
    # external GROUP BY threshold is half of it so aggregation can spill to
    # disk before the per-query cap is hit. The previous 10 GB / 20 GB pair
    # exceeded the container limit and put the spill threshold above the
    # cap, so spilling could never trigger.
    max_memory_usage: 3000000000
    max_bytes_before_external_group_by: 1500000000

  # Backup configuration
  backup:
    enabled: true
    # Cron expression: every day at 02:00.
    schedule: "0 2 * * *"
    retention: 7

# OpenTelemetry Collector - Integrated with SigNoz
otelCollector:
  enabled: true
  replicaCount: 2

  image:
    repository: signoz/signoz-otel-collector
    tag: "0.102.8"
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    ports:
      otlpGrpc: 4317
      otlpHttp: 4318
      metrics: 8888
      healthCheck: 13133

  resources:
    requests:
      cpu: 500m
      memory: 512Mi
    limits:
      cpu: 1000m
      memory: 1Gi

  # Full OTEL Collector Configuration
  config:
    extensions:
      health_check:
        endpoint: "0.0.0.0:13133"
      zpages:
        endpoint: "0.0.0.0:55679"

    receivers:
      otlp:
        protocols:
          grpc:
            endpoint: "0.0.0.0:4317"
            max_recv_msg_size_mib: 16
          http:
            endpoint: "0.0.0.0:4318"
            cors:
              allowed_origins:
                - "https://monitoring.bakewise.ai"
                - "https://*.bakewise.ai"

      # Prometheus receiver for scraping metrics
      # Scrapes the collector's own metrics port (8888, see service.ports).
      prometheus:
        config:
          scrape_configs:
            - job_name: 'otel-collector'
              scrape_interval: 30s
              static_configs:
                - targets: ['localhost:8888']

    processors:
      batch:
        timeout: 10s
        send_batch_size: 2048
        send_batch_max_size: 4096

      # limit_mib (800) is below the 1Gi container memory limit above.
      memory_limiter:
        check_interval: 1s
        limit_mib: 800
        spike_limit_mib: 200

      # Resource detection for K8s
      # NOTE(review): the docker detector queries a Docker daemon; confirm
      # one is reachable from the collector pod, or drop that detector.
      resourcedetection:
        detectors: [env, system, docker]
        timeout: 5s

      # Add resource attributes
      resource:
        attributes:
          - key: deployment.environment
            value: production
            action: upsert
          - key: cluster.name
            value: bakery-ia-prod
            action: upsert

    exporters:
      # Export to SigNoz ClickHouse
      # The three DSNs are quoted alike; each targets its own database on
      # the same clickhouse:9000 endpoint.
      clickhousetraces:
        datasource: "tcp://clickhouse:9000/?database=signoz_traces"
        timeout: 10s
        retry_on_failure:
          enabled: true
          initial_interval: 5s
          max_interval: 30s
          max_elapsed_time: 300s

      signozclickhousemetrics:
        endpoint: "tcp://clickhouse:9000/?database=signoz_metrics"
        timeout: 10s
        retry_on_failure:
          enabled: true
          initial_interval: 5s
          max_interval: 30s
          max_elapsed_time: 300s

      clickhouselogsexporter:
        dsn: "tcp://clickhouse:9000/?database=signoz_logs"
        timeout: 10s
        retry_on_failure:
          enabled: true
          initial_interval: 5s
          max_interval: 30s
          max_elapsed_time: 300s

      # Debug exporter for debugging (replaces deprecated logging exporter)
      debug:
        verbosity: detailed
        sampling_initial: 2
        sampling_thereafter: 500

    service:
      extensions: [health_check, zpages]
      pipelines:
        # memory_limiter runs first in every pipeline.
        traces:
          receivers: [otlp]
          processors: [memory_limiter, batch, resourcedetection, resource]
          exporters: [clickhousetraces, debug]
        metrics:
          receivers: [otlp, prometheus]
          processors: [memory_limiter, batch, resourcedetection, resource]
          exporters: [signozclickhousemetrics]
        logs:
          receivers: [otlp]
          processors: [memory_limiter, batch, resourcedetection, resource]
          exporters: [clickhouselogsexporter, debug]

# OpenTelemetry Collector Deployment Mode
otelCollectorDeployment:
  enabled: true
  mode: deployment

  # HPA for OTEL Collector
  autoscaling:
    enabled: true
    minReplicas: 2
    maxReplicas: 10
    targetCPUUtilizationPercentage: 70
    targetMemoryUtilizationPercentage: 80

# Node Exporter for infrastructure metrics
nodeExporter:
  enabled: true

  service:
    type: ClusterIP
    port: 9100

  resources:
    requests:
      cpu: 100m
      memory: 128Mi
    limits:
      cpu: 200m
      memory: 256Mi

# Schemamanager - Manages ClickHouse schema
schemamanager:
  enabled: true

  image:
    repository: signoz/signoz-schema-migrator
    tag: "0.52.3"
    pullPolicy: IfNotPresent

# Additional Configuration
serviceAccount:
  create: true
  annotations: {}
  name: "signoz"

# Security Context
securityContext:
  runAsNonRoot: true
  runAsUser: 1000
  fsGroup: 1000

# Pod Disruption Budgets for HA
# Each component keeps at least one pod available during voluntary
# disruptions.
podDisruptionBudget:
  frontend:
    enabled: true
    minAvailable: 1
  queryService:
    enabled: true
    minAvailable: 1
  alertmanager:
    enabled: true
    minAvailable: 1
  clickhouse:
    enabled: true
    minAvailable: 1

# Network Policies for security
networkPolicy:
  enabled: true
  policyTypes:
    - Ingress
    - Egress

# Monitoring SigNoz itself
selfMonitoring:
  enabled: true
  serviceMonitor:
    enabled: true
    interval: 30s