Files
bakery-ia/infrastructure/helm/signoz-values-prod.yaml

492 lines
13 KiB
YAML
Raw Normal View History

# SigNoz Helm Chart Values - Production Environment
# High-availability configuration with resource optimization
# DEPLOYED IN bakery-ia NAMESPACE - Ingress managed by bakery-ingress-prod
#
# Official Chart: https://github.com/SigNoz/charts
# Install Command: helm install signoz signoz/signoz -n bakery-ia -f signoz-values-prod.yaml
# Values shared by the parent chart and its sub-charts.
global:
  # On MicroK8s use 'microk8s-hostpath' (or a custom storage class) instead.
  storageClass: 'standard'
  clusterName: 'bakery-ia-prod'
  domain: 'monitoring.bakewise.ai'
  # Docker Hub pull secret, applied to all sub-charts (Zookeeper, ClickHouse, ...).
  imagePullSecrets:
    - dockerhub-creds
# Root-level Docker Hub pull secret, used by the SigNoz components themselves.
imagePullSecrets:
  - dockerhub-creds
2026-01-08 12:58:00 +01:00
2026-01-09 07:26:11 +01:00
# SigNoz main component (unified frontend + query service).
# BREAKING CHANGE: v0.89.0+ uses a unified component instead of separate
# frontend/queryService deployments.
signoz:
  replicaCount: 2

  image:
    repository: signoz/signoz
    tag: v0.106.0
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    port: 8080  # HTTP/API port
    internalPort: 8085  # Internal gRPC port

  # Built-in ingress is DISABLED - the unified bakery-ingress-prod is used instead.
  # Route configured in infrastructure/kubernetes/overlays/prod/prod-ingress.yaml
  ingress:
    enabled: false

  resources:
    requests:
      cpu: 500m
      memory: 1Gi
    limits:
      cpu: 2000m
      memory: 4Gi

  # Pod anti-affinity for HA: prefer placing replicas on different nodes.
  # The selector targets the unified component. The previous value,
  # "query-service", is the pre-v0.89.0 component label and no longer matches
  # the pods of this deployment, which made the rule a no-op.
  # NOTE(review): "signoz" is the chart's default component label
  # (signoz.name); adjust if that value is overridden.
  affinity:
    podAntiAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 100
          podAffinityTerm:
            labelSelector:
              matchLabels:
                app.kubernetes.io/component: signoz
            topologyKey: kubernetes.io/hostname

  # Environment variables (new format - replaces configVars).
  # Values are quoted so they reach the container as strings.
  env:
    signoz_telemetrystore_provider: "clickhouse"
    dot_metrics_enabled: "true"
    signoz_emailing_enabled: "true"
    signoz_alertmanager_provider: "signoz"
    # Retention configuration (720 h = 30 days for prod)
    signoz_traces_ttl_duration_hrs: "720"
    signoz_metrics_ttl_duration_hrs: "720"
    signoz_logs_ttl_duration_hrs: "720"
    # SMTP configuration for email alerts
    signoz_smtp_enabled: "true"
    signoz_smtp_host: "smtp.gmail.com"
    signoz_smtp_port: "587"
    signoz_smtp_from: "alerts@bakewise.ai"
    signoz_smtp_username: "alerts@bakewise.ai"
    # Password should be set via secret: signoz_smtp_password

  persistence:
    enabled: true
    size: 20Gi
    storageClass: "standard"

  # Horizontal Pod Autoscaler
  autoscaling:
    enabled: true
    minReplicas: 2
    maxReplicas: 5
    targetCPUUtilizationPercentage: 70
    targetMemoryUtilizationPercentage: 80
# Standalone AlertManager deployment and its routing configuration.
alertmanager:
  enabled: true
  replicaCount: 2

  image:
    repository: signoz/alertmanager
    tag: "0.23.5"
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    port: 9093

  resources:
    requests:
      cpu: 100m
      memory: 128Mi
    limits:
      cpu: 500m
      memory: 512Mi

  # Prefer scheduling the replicas on different nodes.
  affinity:
    podAntiAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 100
          podAffinityTerm:
            labelSelector:
              matchExpressions:
                - key: app
                  operator: In
                  values:
                    - signoz-alertmanager
            topologyKey: kubernetes.io/hostname

  persistence:
    enabled: true
    size: 5Gi
    storageClass: "standard"

  config:
    global:
      resolve_timeout: 5m
      smtp_smarthost: "smtp.gmail.com:587"
      smtp_from: "alerts@bakewise.ai"
      smtp_auth_username: "alerts@bakewise.ai"
      # NOTE(review): the ${...} placeholders here and in slack_configs are
      # passed through literally by this file - confirm that something
      # downstream substitutes them before AlertManager reads the config.
      smtp_auth_password: "${SMTP_PASSWORD}"
      smtp_require_tls: true

    route:
      group_by:
        - "alertname"
        - "cluster"
        - "service"
        - "severity"
      group_wait: 10s
      group_interval: 10s
      repeat_interval: 12h
      # Fallback receiver for alerts that match none of the routes below.
      receiver: "critical-alerts"
      routes:
        - match:
            severity: critical
          receiver: "critical-alerts"
          continue: true
        - match:
            severity: warning
          receiver: "warning-alerts"

    receivers:
      - name: "critical-alerts"
        email_configs:
          - to: "critical-alerts@bakewise.ai"
            headers:
              Subject: "[CRITICAL] {{ .GroupLabels.alertname }} - Bakery IA"
        # Critical alerts are additionally posted to Slack.
        slack_configs:
          - api_url: "${SLACK_WEBHOOK_URL}"
            channel: "#alerts-critical"
            title: "[CRITICAL] {{ .GroupLabels.alertname }}"
            text: "{{ range .Alerts }}{{ .Annotations.description }}{{ end }}"
      - name: "warning-alerts"
        email_configs:
          - to: "oncall@bakewise.ai"
            headers:
              Subject: "[WARNING] {{ .GroupLabels.alertname }} - Bakery IA"
# ClickHouse Configuration - Time Series Database
clickhouse:
  enabled: true
  installCustomStorageClass: false

  image:
    registry: docker.io
    repository: clickhouse/clickhouse-server
    tag: "25.5.6"
    pullPolicy: IfNotPresent

  # ClickHouse resources (nested config)
  clickhouse:
    resources:
      requests:
        cpu: 1000m
        memory: 2Gi
      limits:
        cpu: 4000m
        memory: 8Gi

  # Hard anti-affinity: never co-locate two ClickHouse pods on one node.
  affinity:
    podAntiAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        - labelSelector:
            matchExpressions:
              - key: app
                operator: In
                values:
                  - signoz-clickhouse
          topologyKey: kubernetes.io/hostname

  persistence:
    enabled: true
    size: 100Gi
    storageClass: "standard"

  # Cold storage configuration for better disk space management.
  # NOTE(review): no storage backend/endpoint is configured in this block -
  # confirm the chart defaults are what production should use.
  coldStorage:
    enabled: true
    # 10 GiB kept free. Quoted so Helm does not render the large integer in
    # scientific notation when it is templated into the ClickHouse config.
    defaultKeepFreeSpaceBytes: "10737418240"
    ttl:
      # Data older than this many days is deleted (not moved to cold storage).
      deleteTTLDays: 30

  # Zookeeper (required by ClickHouse for coordination). Kept under
  # `clickhouse` because Zookeeper is a dependency of the ClickHouse sub-chart.
  zookeeper:
    enabled: true
    replicaCount: 3  # CRITICAL: Always use 3 replicas for production HA
    image:
      tag: "3.7.1"
    resources:
      requests:
        cpu: 100m
        memory: 256Mi
      limits:
        cpu: 500m
        memory: 512Mi
    persistence:
      enabled: true
      size: 10Gi
      storageClass: "standard"
2026-01-08 12:58:00 +01:00
# OpenTelemetry Collector - Integrated with SigNoz
otelCollector:
  enabled: true
  replicaCount: 2

  image:
    repository: signoz/signoz-otel-collector
    tag: v0.129.12
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    ports:
      - name: otlp-grpc
        port: 4317
      - name: otlp-http
        port: 4318
      - name: metrics
        port: 8888
      - name: healthcheck
        port: 13133

  resources:
    requests:
      cpu: 500m
      memory: 512Mi
    limits:
      cpu: 2000m
      memory: 2Gi

  # Full OTEL Collector Configuration
  config:
    # Connectors - bridge between pipelines
    connectors:
      signozmeter:
        dimensions:
          - name: service.name
          - name: deployment.environment
          - name: host.name
        metrics_flush_interval: 1h

    extensions:
      health_check:
        endpoint: 0.0.0.0:13133
      zpages:
        endpoint: 0.0.0.0:55679

    receivers:
      otlp:
        protocols:
          grpc:
            endpoint: 0.0.0.0:4317
            max_recv_msg_size_mib: 32  # Increased for larger payloads
          http:
            endpoint: 0.0.0.0:4318
            cors:
              allowed_origins:
                - "https://monitoring.bakewise.ai"
                - "https://*.bakewise.ai"
      # Prometheus receiver scraping the collector's own metrics port
      prometheus:
        config:
          scrape_configs:
            - job_name: 'otel-collector'
              scrape_interval: 30s
              static_configs:
                - targets: ['localhost:8888']

    processors:
      # High-throughput batching for traces, metrics and logs
      batch:
        timeout: 1s
        send_batch_size: 50000
        send_batch_max_size: 50000

      # Batch processor for meter data
      batch/meter:
        timeout: 1s
        send_batch_size: 20000
        send_batch_max_size: 25000

      memory_limiter:
        check_interval: 1s
        limit_mib: 1500  # ~75% of the 2Gi container memory limit
        spike_limit_mib: 300

      # Resource detection.
      # "kubernetes" is not a resourcedetection detector name and "docker"
      # needs access to a Docker socket, which the collector pod does not
      # mount, so only the env and system detectors are enabled.
      # NOTE(review): add "k8snode" here if node metadata is wanted and the
      # collector's RBAC / K8S_NODE_NAME env are set up for it.
      resourcedetection:
        detectors: [env, system]
        timeout: 5s

      # Add resource attributes
      resource:
        attributes:
          - key: deployment.environment
            value: production
            action: upsert
          - key: cluster.name
            value: bakery-ia-prod
            action: upsert

      # SigNoz span metrics processor with delta aggregation (recommended).
      # Generates RED metrics (Rate, Error, Duration) from trace spans.
      signozspanmetrics/delta:
        aggregation_temporality: AGGREGATION_TEMPORALITY_DELTA
        metrics_exporter: signozclickhousemetrics
        latency_histogram_buckets:
          [100us, 1ms, 2ms, 6ms, 10ms, 50ms, 100ms, 250ms, 500ms, 1000ms,
           1400ms, 2000ms, 5s, 10s, 20s, 40s, 60s]
        dimensions_cache_size: 100000
        dimensions:
          - name: service.namespace
            default: default
          - name: deployment.environment
            default: production
          - name: signoz.collector.id

    # NOTE(review): every DSN below targets the bare host "clickhouse" with no
    # credentials - confirm that name resolves in the bakery-ia namespace and
    # that the default ClickHouse user is permitted.
    exporters:
      # Export traces to SigNoz ClickHouse
      clickhousetraces:
        datasource: tcp://clickhouse:9000/?database=signoz_traces
        timeout: 10s
        retry_on_failure:
          enabled: true
          initial_interval: 5s
          max_interval: 30s
          max_elapsed_time: 300s

      # Uses `dsn` like the other SigNoz ClickHouse exporters in this file;
      # the previous `endpoint` key belongs to the older metrics-write exporter.
      signozclickhousemetrics:
        dsn: "tcp://clickhouse:9000/?database=signoz_metrics"
        timeout: 10s
        retry_on_failure:
          enabled: true
          initial_interval: 5s
          max_interval: 30s
          max_elapsed_time: 300s

      # ClickHouse exporter for meter data (usage metrics)
      signozclickhousemeter:
        dsn: "tcp://clickhouse:9000/?database=signoz_meter"
        timeout: 45s
        sending_queue:
          enabled: false

      clickhouselogsexporter:
        dsn: tcp://clickhouse:9000/?database=signoz_logs
        timeout: 10s
        retry_on_failure:
          enabled: true
          initial_interval: 5s
          max_interval: 30s
          max_elapsed_time: 300s

      # Metadata exporter for service metadata
      metadataexporter:
        dsn: "tcp://clickhouse:9000/?database=signoz_metadata"
        timeout: 10s
        cache:
          provider: in_memory

      # Debug exporter (replaces the deprecated logging exporter).
      # Defined for troubleshooting; not referenced by any pipeline below.
      debug:
        verbosity: detailed
        sampling_initial: 2
        sampling_thereafter: 500

    service:
      extensions: [health_check, zpages]
      pipelines:
        # Processor order matters: memory_limiter first, resource enrichment
        # next (so span metrics see deployment.environment), batch last.

        # Traces pipeline - exports to ClickHouse and signozmeter connector
        traces:
          receivers: [otlp]
          processors:
            - memory_limiter
            - resourcedetection
            - resource
            - signozspanmetrics/delta
            - batch
          exporters: [clickhousetraces, metadataexporter, signozmeter]

        # Metrics pipeline
        metrics:
          receivers: [otlp, prometheus]
          processors: [memory_limiter, resourcedetection, resource, batch]
          exporters: [signozclickhousemetrics]

        # Meter pipeline - receives from signozmeter connector
        metrics/meter:
          receivers: [signozmeter]
          processors: [batch/meter]
          exporters: [signozclickhousemeter]

        # Logs pipeline
        logs:
          receivers: [otlp]
          processors: [memory_limiter, resourcedetection, resource, batch]
          exporters: [clickhouselogsexporter]

  # HPA for OTEL Collector
  autoscaling:
    enabled: true
    minReplicas: 2
    maxReplicas: 10
    targetCPUUtilizationPercentage: 70
    targetMemoryUtilizationPercentage: 80
2026-01-09 07:26:11 +01:00
# Schema migrator: applies ClickHouse schema migrations.
schemaMigrator:
  enabled: true

  image:
    repository: signoz/signoz-schema-migrator
    tag: 'v0.129.12'
    pullPolicy: IfNotPresent

  # Run the migrator through Helm hooks so upgrades are handled properly.
  upgradeHelmHooks: true
2026-01-08 12:58:00 +01:00
# Additional configuration: create a dedicated service account named 'signoz'.
serviceAccount:
  create: true
  annotations: {}
  name: 'signoz'
# Pod security context: run as non-root UID 1000, with fsGroup 1000.
securityContext:
  runAsNonRoot: true
  runAsUser: 1000
  fsGroup: 1000
# Pod Disruption Budgets for HA: keep at least one pod of each component up.
# NOTE(review): `frontend` and `queryService` predate the unified `signoz`
# component used elsewhere in this file - confirm the chart still reads them.
podDisruptionBudget:
  frontend:
    enabled: true
    minAvailable: 1

  queryService:
    enabled: true
    minAvailable: 1

  alertmanager:
    enabled: true
    minAvailable: 1

  clickhouse:
    enabled: true
    minAvailable: 1
# Network policies: enabled for both ingress and egress traffic.
networkPolicy:
  enabled: true
  policyTypes: [Ingress, Egress]
# Self-monitoring: expose SigNoz's own metrics through a ServiceMonitor
# with a 30s interval.
selfMonitoring:
  enabled: true
  serviceMonitor:
    enabled: true
    interval: 30s