---
# SigNoz Helm Chart Values - Production Environment
# High-availability configuration with resource optimization
# DEPLOYED IN bakery-ia NAMESPACE - Ingress managed by SigNoz Helm chart
#
# Official Chart: https://github.com/SigNoz/charts
# Install Command:
#   helm upgrade --install signoz signoz/signoz -n bakery-ia -f signoz-values-prod.yaml
#
# IMPORTANT: This chart works together with the k8s-infra chart for
# infrastructure monitoring. Deploy k8s-infra after this one:
#   helm upgrade --install k8s-infra signoz/k8s-infra -n bakery-ia -f k8s-infra-values-prod.yaml
#
# MEMORY OPTIMIZATION NOTES:
# - ClickHouse memory limit increased to 8Gi to prevent OOM errors
# - Retention reduced to 3 days for traces, 7 days for metrics/logs

global:
  storageClass: "microk8s-hostpath"
  clusterName: "bakery-ia-prod"
  domain: "monitoring.bakewise.ai"

# Ingress configuration for the SigNoz frontend
signoz:
  ingress:
    enabled: true
    className: nginx
    annotations:
      # Force HTTPS for every request
      nginx.ingress.kubernetes.io/ssl-redirect: "true"
      nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
      # Request body limit and proxy timeouts (seconds)
      nginx.ingress.kubernetes.io/proxy-body-size: "100m"
      nginx.ingress.kubernetes.io/proxy-read-timeout: "3600"
      nginx.ingress.kubernetes.io/proxy-send-timeout: "3600"
      # TLS certificate issued by cert-manager
      cert-manager.io/cluster-issuer: "letsencrypt-production"
      # Per-client rate limiting
      nginx.ingress.kubernetes.io/limit-rps: "50"
      nginx.ingress.kubernetes.io/limit-connections: "25"
    hosts:
      - host: monitoring.bakewise.ai
        paths:
          - path: /
            pathType: ImplementationSpecific
            port: 8080
    tls:
      - hosts:
          - monitoring.bakewise.ai
        secretName: bakery-ia-prod-tls-cert

# ============================================================================
# CLICKHOUSE CONFIGURATION
# Memory limit increased to 8Gi to prevent OOM errors
# (was 4Gi, causing code 241 errors)
# ============================================================================
clickhouse:
  persistence:
    size: 20Gi

  resources:
    requests:
      memory: "4Gi"
      cpu: "1000m"
    limits:
      memory: "8Gi"
      cpu: "2000m"

  # Server-level settings only (NOT user-level settings like max_threads).
  # User-level settings must go in the profiles section below.
  # NOTE(review): run `helm template` and confirm these keys appear under
  # spec.configuration of the rendered ClickHouseInstallation.
  settings:
    # Max server memory usage: 6.4 GB (decimal), i.e. roughly 75% of the
    # 8Gi container limit
    max_server_memory_usage: "6400000000"
    # Mark cache size (256 MiB)
    mark_cache_size: "268435456"
    # Uncompressed cache (256 MiB)
    uncompressed_cache_size: "268435456"
    # Max concurrent queries
    max_concurrent_queries: "100"
    # Largest total part size eligible for a background merge (1 GiB).
    # This is a MergeTree setting, not a user-level one, so it is set through
    # the merge_tree section of the server config instead of a profile.
    merge_tree/max_bytes_to_merge_at_max_space_in_pool: "1073741824"

  # User-level settings go in profiles, keyed as "<profile>/<setting>"
  # (the flat form documented by the chart; a nested mapping is not the
  # documented shape).
  profiles:
    # Max memory per query: 2 GB
    default/max_memory_usage: "2000000000"
    # Max threads per query
    default/max_threads: "4"

  coldStorage:
    enabled: false

# ============================================================================
# DATA RETENTION CONFIGURATION
# Reduced retention to minimize storage and memory pressure
# ============================================================================
# NOTE(review): the ingress above is configured under `signoz`, while this
# section uses the top-level `queryService` key. Confirm that the deployed
# chart version reads both, otherwise these values are silently ignored.
queryService:
  resources:
    requests:
      memory: "1Gi"
      cpu: "500m"
    limits:
      memory: "2Gi"
      cpu: "1000m"

  # Retention configuration via environment variables
  # NOTE(review): verify that the backend honours these variable names;
  # if it does not, retention has to be set through the SigNoz settings UI/API.
  configVars:
    # Trace retention: 3 days (72 hours)
    SIGNOZ_TRACE_TTL_DURATION_HOURS: "72"
    # Logs retention: 7 days (168 hours)
    SIGNOZ_LOGS_TTL_DURATION_HOURS: "168"
    # Metrics retention: 7 days (168 hours)
    SIGNOZ_METRICS_TTL_DURATION_HOURS: "168"

# ============================================================================
# OTEL COLLECTOR CONFIGURATION
# This collector receives data from:
# - Application services (traces, logs, metrics via OTLP)
# - k8s-infra chart (infrastructure metrics)
# ============================================================================
otelCollector:
  resources:
    requests:
      memory: "1Gi"
      cpu: "500m"
    limits:
      memory: "2Gi"
      cpu: "1000m"

# ============================================================================
# ALERTMANAGER CONFIGURATION
# ============================================================================
# NOTE(review): confirm the deployed chart version still ships a separate
# alertmanager component that reads this key.
alertmanager:
  resources:
    requests:
      memory: "512Mi"
      cpu: "250m"
    limits:
      memory: "1Gi"
      cpu: "500m"

# ============================================================================
# ZOOKEEPER CONFIGURATION
# ============================================================================
zookeeper:
  resources:
    requests:
      memory: "512Mi"
      cpu: "250m"
    limits:
      memory: "1Gi"
      cpu: "500m"

  persistence:
    size: 5Gi