# SigNoz Helm Chart Values - Production Environment
# High-availability configuration with resource optimization
# DEPLOYED IN bakery-ia NAMESPACE - Ingress managed by bakery-ingress-prod
#
# Official Chart: https://github.com/SigNoz/charts
# Install Command:
#   helm install signoz signoz/signoz -n bakery-ia -f signoz-values-prod.yaml
---
global:
  # For MicroK8s, use "microk8s-hostpath" or a custom storage class
  storageClass: "standard"
  clusterName: "bakery-ia-prod"
  domain: "monitoring.bakewise.ai"
  # Docker Hub credentials - applied to all sub-charts
  # (including Zookeeper, ClickHouse, etc.)
  imagePullSecrets:
    - dockerhub-creds

# Docker Hub credentials for pulling images (root level for SigNoz components)
imagePullSecrets:
  - dockerhub-creds

# SigNoz Main Component (unified frontend + query service)
# BREAKING CHANGE: v0.89.0+ uses a unified component instead of separate
# frontend/queryService
signoz:
  replicaCount: 2

  image:
    repository: signoz/signoz
    tag: v0.106.0  # Latest stable version
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    port: 8080  # HTTP/API port
    internalPort: 8085  # Internal gRPC port

  # DISABLE built-in ingress - using unified bakery-ingress-prod instead
  # Route configured in infrastructure/kubernetes/overlays/prod/prod-ingress.yaml
  ingress:
    enabled: false

  resources:
    requests:
      cpu: 500m
      memory: 1Gi
    limits:
      cpu: 2000m
      memory: 4Gi

  # Pod Anti-affinity for HA
  # NOTE(review): the selector still targets the pre-unification
  # "query-service" component label - confirm the unified SigNoz pods carry it,
  # otherwise this preference never matches and has no effect.
  affinity:
    podAntiAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 100
          podAffinityTerm:
            labelSelector:
              matchLabels:
                app.kubernetes.io/component: query-service
            topologyKey: kubernetes.io/hostname

  # Environment variables (new format - replaces configVars)
  # Values are deliberately quoted so they stay strings.
  env:
    signoz_telemetrystore_provider: "clickhouse"
    dot_metrics_enabled: "true"
    signoz_emailing_enabled: "true"
    signoz_alertmanager_provider: "signoz"
    # Retention configuration (30 days for prod; 720 h = 30 d)
    signoz_traces_ttl_duration_hrs: "720"
    signoz_metrics_ttl_duration_hrs: "720"
    signoz_logs_ttl_duration_hrs: "720"
    # SMTP configuration for email alerts
    signoz_smtp_enabled: "true"
    signoz_smtp_host: "smtp.gmail.com"
    signoz_smtp_port: "587"
    signoz_smtp_from: "alerts@bakewise.ai"
    signoz_smtp_username: "alerts@bakewise.ai"
    # Password should be set via secret: signoz_smtp_password

  persistence:
    enabled: true
    size: 20Gi
    storageClass: "standard"

  # Horizontal Pod Autoscaler
  autoscaling:
    enabled: true
    minReplicas: 2
    maxReplicas: 5
    targetCPUUtilizationPercentage: 70
    targetMemoryUtilizationPercentage: 80

# AlertManager Configuration
alertmanager:
  enabled: true
  replicaCount: 2

  image:
    repository: signoz/alertmanager
    tag: '0.23.5'
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    port: 9093

  resources:
    requests:
      cpu: 100m
      memory: 128Mi
    limits:
      cpu: 500m
      memory: 512Mi

  # Pod Anti-affinity for HA
  affinity:
    podAntiAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 100
          podAffinityTerm:
            labelSelector:
              matchExpressions:
                - key: app
                  operator: In
                  values:
                    - signoz-alertmanager
            topologyKey: kubernetes.io/hostname

  persistence:
    enabled: true
    size: 5Gi
    storageClass: "standard"

  config:
    global:
      resolve_timeout: 5m
      smtp_smarthost: 'smtp.gmail.com:587'
      smtp_from: 'alerts@bakewise.ai'
      smtp_auth_username: 'alerts@bakewise.ai'
      # NOTE(review): Alertmanager does not expand environment variables in its
      # config file by itself - confirm the deployment substitutes the
      # ${SMTP_PASSWORD} / ${SLACK_WEBHOOK_URL} placeholders before start-up.
      smtp_auth_password: '${SMTP_PASSWORD}'
      smtp_require_tls: true

    route:
      group_by: ['alertname', 'cluster', 'service', 'severity']
      group_wait: 10s
      group_interval: 10s
      repeat_interval: 12h
      # Default receiver for alerts that match none of the routes below
      receiver: 'critical-alerts'
      routes:
        - match:
            severity: critical
          receiver: 'critical-alerts'
          continue: true
        - match:
            severity: warning
          receiver: 'warning-alerts'

    receivers:
      - name: 'critical-alerts'
        email_configs:
          - to: 'critical-alerts@bakewise.ai'
            headers:
              Subject: '[CRITICAL] {{ .GroupLabels.alertname }} - Bakery IA'
        # Slack webhook for critical alerts
        slack_configs:
          - api_url: '${SLACK_WEBHOOK_URL}'
            channel: '#alerts-critical'
            title: '[CRITICAL] {{ .GroupLabels.alertname }}'
            text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
      - name: 'warning-alerts'
        email_configs:
          - to: 'oncall@bakewise.ai'
            headers:
              Subject: '[WARNING] {{ .GroupLabels.alertname }} - Bakery IA'

# ClickHouse Configuration - Time Series Database
clickhouse:
  enabled: true
  installCustomStorageClass: false

  image:
    registry: docker.io
    repository: clickhouse/clickhouse-server
    tag: '25.5.6'  # Updated to official recommended version
    pullPolicy: IfNotPresent

  # ClickHouse resources (nested config)
  clickhouse:
    resources:
      requests:
        cpu: 1000m
        memory: 2Gi
      limits:
        cpu: 4000m
        memory: 8Gi

  # Pod Anti-affinity for HA (hard requirement: one ClickHouse pod per node)
  affinity:
    podAntiAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        - labelSelector:
            matchExpressions:
              - key: app
                operator: In
                values:
                  - signoz-clickhouse
          topologyKey: kubernetes.io/hostname

  persistence:
    enabled: true
    size: 100Gi
    storageClass: "standard"

  # Cold storage configuration for better disk space management
  # NOTE(review): no cold-storage backend (type/endpoint/credentials) is set in
  # this file - confirm it is supplied elsewhere before keeping this enabled.
  coldStorage:
    enabled: true
    defaultKeepFreeSpaceBytes: 10737418240  # Keep 10 GiB free
    ttl:
      # Per the key name this is a deletion TTL, not a move-to-cold-storage TTL
      # NOTE(review): confirm against the chart's coldStorage documentation.
      deleteTTLDays: 30

# Zookeeper Configuration (required by ClickHouse for coordination)
# NOTE(review): the SigNoz chart appears to consume ZooKeeper settings through
# the ClickHouse sub-chart (clickhouse.zookeeper) - confirm this top-level key
# is actually picked up.
zookeeper:
  enabled: true
  replicaCount: 3  # CRITICAL: Always use 3 replicas for production HA

  image:
    tag: '3.7.1'  # Official recommended version

  resources:
    requests:
      cpu: 100m
      memory: 256Mi
    limits:
      cpu: 500m
      memory: 512Mi

  persistence:
    enabled: true
    size: 10Gi
    storageClass: "standard"

# OpenTelemetry Collector - Integrated with SigNoz
otelCollector:
  enabled: true
  replicaCount: 2

  image:
    repository: signoz/signoz-otel-collector
    tag: v0.129.12  # Updated to latest recommended version
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    ports:
      - name: otlp-grpc
        port: 4317
      - name: otlp-http
        port: 4318
      - name: metrics
        port: 8888
      - name: healthcheck
        port: 13133

  resources:
    requests:
      cpu: 500m
      memory: 512Mi
    limits:
      cpu: 2000m
      memory: 2Gi

  # Full OTEL Collector Configuration
  config:
    # Connectors - bridge between pipelines
    connectors:
      signozmeter:
        dimensions:
          - name: service.name
          - name: deployment.environment
          - name: host.name
        metrics_flush_interval: 1h

    extensions:
      health_check:
        endpoint: 0.0.0.0:13133
      zpages:
        endpoint: 0.0.0.0:55679

    receivers:
      otlp:
        protocols:
          grpc:
            endpoint: 0.0.0.0:4317
            max_recv_msg_size_mib: 32  # Increased for larger payloads
          http:
            endpoint: 0.0.0.0:4318
            cors:
              allowed_origins:
                - "https://monitoring.bakewise.ai"
                - "https://*.bakewise.ai"
      # Prometheus receiver for scraping the collector's own metrics port
      prometheus:
        config:
          scrape_configs:
            - job_name: 'otel-collector'
              scrape_interval: 30s
              static_configs:
                - targets: ['localhost:8888']

    processors:
      # High-performance batch processing (official recommendation)
      batch:
        timeout: 1s  # Reduced from 10s for faster processing
        # Increased from 2048 (official recommendation for traces)
        send_batch_size: 50000
        send_batch_max_size: 50000
      # Batch processor for meter data
      batch/meter:
        timeout: 1s
        send_batch_size: 20000
        send_batch_max_size: 25000
      memory_limiter:
        check_interval: 1s
        # Roughly 75% of the 2Gi (2048Mi) container memory limit
        limit_mib: 1500
        spike_limit_mib: 300
      # Resource detection
      # "kubernetes" is not a detector name the resourcedetection processor
      # recognizes, so it has been removed from this list.
      # NOTE(review): the "docker" detector needs a reachable Docker daemon;
      # confirm it is useful on this (containerd-based) cluster.
      resourcedetection:
        detectors: [env, system, docker]
        timeout: 5s
      # Add resource attributes
      resource:
        attributes:
          - key: deployment.environment
            value: production
            action: upsert
          - key: cluster.name
            value: bakery-ia-prod
            action: upsert
      # SigNoz span metrics processor with delta aggregation (recommended)
      # Generates RED metrics (Rate, Error, Duration) from trace spans
      signozspanmetrics/delta:
        aggregation_temporality: AGGREGATION_TEMPORALITY_DELTA
        metrics_exporter: signozclickhousemetrics
        latency_histogram_buckets:
          [100us, 1ms, 2ms, 6ms, 10ms, 50ms, 100ms, 250ms, 500ms, 1000ms,
           1400ms, 2000ms, 5s, 10s, 20s, 40s, 60s]
        dimensions_cache_size: 100000
        dimensions:
          - name: service.namespace
            default: default
          - name: deployment.environment
            default: production
          - name: signoz.collector.id

    exporters:
      # Export to SigNoz ClickHouse
      clickhousetraces:
        datasource: tcp://clickhouse:9000/?database=signoz_traces
        timeout: 10s
        retry_on_failure:
          enabled: true
          initial_interval: 5s
          max_interval: 30s
          max_elapsed_time: 300s
      # Uses "dsn" like the other SigNoz ClickHouse exporters in this file;
      # "endpoint" is believed to belong to the legacy clickhousemetricswrite
      # exporter.
      signozclickhousemetrics:
        dsn: "tcp://clickhouse:9000/?database=signoz_metrics"
        timeout: 10s
        retry_on_failure:
          enabled: true
          initial_interval: 5s
          max_interval: 30s
          max_elapsed_time: 300s
      # ClickHouse exporter for meter data (usage metrics)
      signozclickhousemeter:
        dsn: "tcp://clickhouse:9000/?database=signoz_meter"
        timeout: 45s
        sending_queue:
          enabled: false
      clickhouselogsexporter:
        dsn: tcp://clickhouse:9000/?database=signoz_logs
        timeout: 10s
        retry_on_failure:
          enabled: true
          initial_interval: 5s
          max_interval: 30s
          max_elapsed_time: 300s
      # Metadata exporter for service metadata
      metadataexporter:
        dsn: "tcp://clickhouse:9000/?database=signoz_metadata"
        timeout: 10s
        cache:
          provider: in_memory
      # Debug exporter for debugging (replaces deprecated logging exporter)
      # Defined for ad-hoc troubleshooting; not wired into any pipeline below.
      debug:
        verbosity: detailed
        sampling_initial: 2
        sampling_thereafter: 500

    service:
      extensions: [health_check, zpages]
      # Processor order in every pipeline: memory_limiter first, resource
      # enrichment next, batch last. Span metrics are generated after
      # enrichment so they see the upserted deployment.environment.
      pipelines:
        # Traces pipeline - exports to ClickHouse and signozmeter connector
        traces:
          receivers: [otlp]
          processors:
            - memory_limiter
            - resourcedetection
            - resource
            - signozspanmetrics/delta
            - batch
          exporters: [clickhousetraces, metadataexporter, signozmeter]
        # Metrics pipeline
        metrics:
          receivers: [otlp, prometheus]
          processors: [memory_limiter, resourcedetection, resource, batch]
          exporters: [signozclickhousemetrics]
        # Meter pipeline - receives from signozmeter connector
        metrics/meter:
          receivers: [signozmeter]
          processors: [batch/meter]
          exporters: [signozclickhousemeter]
        # Logs pipeline
        logs:
          receivers: [otlp]
          processors: [memory_limiter, resourcedetection, resource, batch]
          exporters: [clickhouselogsexporter]

  # HPA for OTEL Collector
  autoscaling:
    enabled: true
    minReplicas: 2
    maxReplicas: 10
    targetCPUUtilizationPercentage: 70
    targetMemoryUtilizationPercentage: 80

# Schema Migrator - Manages ClickHouse schema migrations
schemaMigrator:
  enabled: true

  image:
    repository: signoz/signoz-schema-migrator
    tag: v0.129.12  # Updated to latest version
    pullPolicy: IfNotPresent

  # Enable Helm hooks for proper upgrade handling
  upgradeHelmHooks: true

# Additional Configuration
serviceAccount:
  create: true
  annotations: {}
  name: "signoz"

# Security Context
securityContext:
  runAsNonRoot: true
  runAsUser: 1000
  fsGroup: 1000

# Pod Disruption Budgets for HA
podDisruptionBudget:
  frontend:
    enabled: true
    minAvailable: 1
  queryService:
    enabled: true
    minAvailable: 1
  alertmanager:
    enabled: true
    minAvailable: 1
  clickhouse:
    enabled: true
    minAvailable: 1

# Network Policies for security
networkPolicy:
  enabled: true
  policyTypes:
    - Ingress
    - Egress

# Monitoring SigNoz itself
selfMonitoring:
  enabled: true
  serviceMonitor:
    enabled: true
    interval: 30s