Files
bakery-ia/infrastructure/helm/signoz-values-prod.yaml
2026-01-09 06:57:18 +01:00

479 lines
11 KiB
YAML

# SigNoz Helm Chart Values - Production Environment
# High-availability configuration with resource optimization
#
# Official Chart: https://github.com/SigNoz/charts
# Install Command: helm install signoz signoz/signoz -n signoz --create-namespace -f signoz-values-prod.yaml
# Chart-wide settings shared by SigNoz and its sub-charts.
global:
  storageClass: "standard"
  domain: "monitoring.bakewise.ai"
  # Docker Hub pull secret; declared under `global` so that all sub-charts
  # (including Zookeeper, ClickHouse, etc.) receive it as well.
  imagePullSecrets:
    - dockerhub-creds
# Root-level Docker Hub pull secret, used by the SigNoz components themselves
# (same secret name as the one listed under `global`).
imagePullSecrets:
  - dockerhub-creds
# Frontend (web UI): two replicas exposed through an nginx ingress.
frontend:
  replicaCount: 2

  image:
    repository: signoz/frontend
    tag: "0.52.3"
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    port: 3301

  ingress:
    enabled: true
    className: nginx
    annotations:
      # The path below is a regex; its second capture group ($2) becomes the
      # upstream request path, so the /signoz prefix is stripped.
      nginx.ingress.kubernetes.io/rewrite-target: /$2
      nginx.ingress.kubernetes.io/use-regex: "true"
      cert-manager.io/cluster-issuer: "letsencrypt-prod"
      nginx.ingress.kubernetes.io/ssl-redirect: "true"
    hosts:
      - host: monitoring.bakewise.ai
        paths:
          - path: /signoz(/|$)(.*)
            pathType: ImplementationSpecific
    tls:
      - secretName: signoz-tls
        hosts:
          - monitoring.bakewise.ai

  resources:
    requests:
      cpu: 250m
      memory: 512Mi
    limits:
      cpu: 500m
      memory: 1Gi

  # Soft anti-affinity for HA: prefer placing frontend pods on different nodes.
  affinity:
    podAntiAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 100
          podAffinityTerm:
            labelSelector:
              matchExpressions:
                - key: app
                  operator: In
                  values:
                    - signoz-frontend
            topologyKey: kubernetes.io/hostname

  env:
    - name: FRONTEND_REFRESH_INTERVAL
      value: "30000"
# Query service: two baseline replicas, persistent volume, CPU/memory HPA.
queryService:
  replicaCount: 2

  image:
    repository: signoz/query-service
    tag: "0.52.3"
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    port: 8080

  resources:
    requests:
      cpu: 500m
      memory: 1Gi
    limits:
      cpu: 1000m
      memory: 2Gi

  # Soft anti-affinity for HA: prefer placing query-service pods on different
  # nodes.
  affinity:
    podAntiAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 100
          podAffinityTerm:
            labelSelector:
              matchExpressions:
                - key: app
                  operator: In
                  values:
                    - signoz-query-service
            topologyKey: kubernetes.io/hostname

  # NOTE(review): SIGNOZ_LOCAL_DB_PATH suggests a local on-disk database;
  # confirm that running more than one replica is supported with it.
  env:
    - name: DEPLOYMENT_TYPE
      value: "kubernetes-helm"
    - name: SIGNOZ_LOCAL_DB_PATH
      value: "/var/lib/signoz"
    - name: RETENTION_DAYS
      value: "30"

  persistence:
    enabled: true
    size: 20Gi
    storageClass: "standard"

  # Horizontal Pod Autoscaler: 2-5 replicas, targeting 70% CPU / 80% memory.
  autoscaling:
    enabled: true
    minReplicas: 2
    maxReplicas: 5
    targetCPUUtilizationPercentage: 70
    targetMemoryUtilizationPercentage: 80
# Alertmanager: two replicas, persistent storage, e-mail and Slack routing.
alertmanager:
  replicaCount: 2

  image:
    repository: signoz/alertmanager
    tag: "0.23.5"
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    port: 9093

  resources:
    requests:
      cpu: 250m
      memory: 512Mi
    limits:
      cpu: 500m
      memory: 1Gi

  # Soft anti-affinity for HA: prefer placing alertmanager pods on different
  # nodes.
  affinity:
    podAntiAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 100
          podAffinityTerm:
            labelSelector:
              matchExpressions:
                - key: app
                  operator: In
                  values:
                    - signoz-alertmanager
            topologyKey: kubernetes.io/hostname

  persistence:
    enabled: true
    size: 5Gi
    storageClass: "standard"

  config:
    global:
      resolve_timeout: 5m
      smtp_smarthost: 'smtp.gmail.com:587'
      smtp_from: 'alerts@bakewise.ai'
      smtp_auth_username: 'alerts@bakewise.ai'
      # NOTE(review): the ${...} placeholders in this config are passed through
      # as literal text unless something substitutes them before Alertmanager
      # loads the file - confirm how SMTP_PASSWORD and SLACK_WEBHOOK_URL are
      # injected.
      smtp_auth_password: '${SMTP_PASSWORD}'
      smtp_require_tls: true

    route:
      group_by:
        - 'alertname'
        - 'cluster'
        - 'service'
        - 'severity'
      group_wait: 10s
      group_interval: 10s
      repeat_interval: 12h
      # Fallback receiver for alerts that match none of the routes below.
      receiver: 'critical-alerts'
      routes:
        - match:
            severity: critical
          receiver: 'critical-alerts'
          continue: true
        - match:
            severity: warning
          receiver: 'warning-alerts'

    receivers:
      - name: 'critical-alerts'
        email_configs:
          - to: 'critical-alerts@bakewise.ai'
            headers:
              Subject: '[CRITICAL] {{ .GroupLabels.alertname }} - Bakery IA'
        # Slack webhook for critical alerts
        slack_configs:
          - api_url: '${SLACK_WEBHOOK_URL}'
            channel: '#alerts-critical'
            title: '[CRITICAL] {{ .GroupLabels.alertname }}'
            text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
      - name: 'warning-alerts'
        email_configs:
          - to: 'oncall@bakewise.ai'
            headers:
              Subject: '[WARNING] {{ .GroupLabels.alertname }} - Bakery IA'
# ClickHouse Configuration - Time Series Database
clickhouse:
  replicaCount: 2

  image:
    repository: clickhouse/clickhouse-server
    tag: "24.1.2-alpine"
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    httpPort: 8123
    tcpPort: 9000

  resources:
    requests:
      cpu: 1000m
      memory: 2Gi
    limits:
      cpu: 2000m
      memory: 4Gi

  # Hard anti-affinity for HA: ClickHouse pods must be scheduled on different
  # nodes.
  affinity:
    podAntiAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        - labelSelector:
            matchExpressions:
              - key: app
                operator: In
                values:
                  - signoz-clickhouse
          topologyKey: kubernetes.io/hostname

  persistence:
    enabled: true
    size: 100Gi
    storageClass: "standard"

  # ClickHouse server configuration
  config:
    logger:
      level: information
    max_connections: 4096
    max_concurrent_queries: 500
    # MergeTree insert back-pressure thresholds (active parts per partition):
    # inserts are slowed down at 150 parts and rejected at 300. These do not
    # control data retention.
    merge_tree:
      parts_to_delay_insert: 150
      parts_to_throw_insert: 300
    # Per-query memory tuning, in bytes. The external GROUP BY threshold must
    # be lower than max_memory_usage (about half is recommended), otherwise a
    # query hits the memory limit before it can spill to disk.
    # NOTE(review): max_memory_usage (10 GB) is larger than the 4Gi container
    # memory limit above - confirm which of the two is intended.
    max_memory_usage: 10000000000
    max_bytes_before_external_group_by: 5000000000

  # Backup configuration; `schedule` is a cron expression.
  backup:
    enabled: true
    schedule: "0 2 * * *"
    retention: 7
# OpenTelemetry Collector - Integrated with SigNoz
otelCollector:
  enabled: true
  replicaCount: 2

  image:
    repository: signoz/signoz-otel-collector
    tag: "0.102.8"
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    ports:
      otlpGrpc: 4317
      otlpHttp: 4318
      metrics: 8888
      healthCheck: 13133

  resources:
    requests:
      cpu: 500m
      memory: 512Mi
    limits:
      cpu: 1000m
      memory: 1Gi

  # Full OTEL Collector configuration
  config:
    extensions:
      health_check:
        endpoint: 0.0.0.0:13133
      zpages:
        endpoint: 0.0.0.0:55679

    receivers:
      otlp:
        protocols:
          grpc:
            endpoint: 0.0.0.0:4317
            max_recv_msg_size_mib: 16
          http:
            endpoint: 0.0.0.0:4318
            cors:
              allowed_origins:
                - "https://monitoring.bakewise.ai"
                - "https://*.bakewise.ai"
      # Prometheus receiver: scrapes the collector's own metrics port (8888).
      prometheus:
        config:
          scrape_configs:
            - job_name: 'otel-collector'
              scrape_interval: 30s
              static_configs:
                - targets: ['localhost:8888']

    processors:
      batch:
        timeout: 10s
        send_batch_size: 2048
        send_batch_max_size: 4096
      # limit_mib (800) stays below the 1Gi container memory limit above.
      memory_limiter:
        check_interval: 1s
        limit_mib: 800
        spike_limit_mib: 200
      # Resource detection for K8s. The `docker` detector is intentionally not
      # listed: it queries a Docker daemon, which this configuration does not
      # make available to the collector pod.
      resourcedetection:
        detectors: [env, system]
        timeout: 5s
      # Stamp every signal with the environment and cluster name.
      resource:
        attributes:
          - key: deployment.environment
            value: production
            action: upsert
          - key: cluster.name
            value: bakery-ia-prod
            action: upsert

    exporters:
      # Export to SigNoz ClickHouse (one database per signal type)
      clickhousetraces:
        datasource: tcp://clickhouse:9000/?database=signoz_traces
        timeout: 10s
        retry_on_failure:
          enabled: true
          initial_interval: 5s
          max_interval: 30s
          max_elapsed_time: 300s
      signozclickhousemetrics:
        endpoint: "tcp://clickhouse:9000/?database=signoz_metrics"
        timeout: 10s
        retry_on_failure:
          enabled: true
          initial_interval: 5s
          max_interval: 30s
          max_elapsed_time: 300s
      clickhouselogsexporter:
        dsn: tcp://clickhouse:9000/?database=signoz_logs
        timeout: 10s
        retry_on_failure:
          enabled: true
          initial_interval: 5s
          max_interval: 30s
          max_elapsed_time: 300s
      # Debug exporter for debugging (replaces deprecated logging exporter)
      debug:
        verbosity: detailed
        sampling_initial: 2
        sampling_thereafter: 500

    service:
      extensions: [health_check, zpages]
      pipelines:
        # memory_limiter runs first in every pipeline.
        traces:
          receivers: [otlp]
          processors: [memory_limiter, batch, resourcedetection, resource]
          exporters: [clickhousetraces, debug]
        metrics:
          receivers: [otlp, prometheus]
          processors: [memory_limiter, batch, resourcedetection, resource]
          exporters: [signozclickhousemetrics]
        logs:
          receivers: [otlp]
          processors: [memory_limiter, batch, resourcedetection, resource]
          exporters: [clickhouselogsexporter, debug]
# OpenTelemetry Collector deployment mode and autoscaling.
otelCollectorDeployment:
  enabled: true
  mode: deployment

  # HPA for the OTEL Collector: 2-10 replicas, targeting 70% CPU / 80% memory.
  autoscaling:
    enabled: true
    minReplicas: 2
    maxReplicas: 10
    targetCPUUtilizationPercentage: 70
    targetMemoryUtilizationPercentage: 80
# Node Exporter: infrastructure metrics, exposed on port 9100.
nodeExporter:
  enabled: true

  service:
    type: ClusterIP
    port: 9100

  resources:
    requests:
      cpu: 100m
      memory: 128Mi
    limits:
      cpu: 200m
      memory: 256Mi
# Schema manager: runs the schema-migrator image that manages the ClickHouse
# schema.
schemamanager:
  enabled: true

  image:
    repository: signoz/signoz-schema-migrator
    tag: "0.52.3"
    pullPolicy: IfNotPresent
# Service account: created by the release under the fixed name "signoz",
# with no extra annotations.
serviceAccount:
  create: true
  annotations: {}
  name: "signoz"
# Security context: run as non-root UID 1000, with fsGroup 1000.
securityContext:
  runAsNonRoot: true
  runAsUser: 1000
  fsGroup: 1000
# Pod Disruption Budgets for HA: each listed component keeps at least one pod
# available.
podDisruptionBudget:
  frontend:
    enabled: true
    minAvailable: 1
  queryService:
    enabled: true
    minAvailable: 1
  alertmanager:
    enabled: true
    minAvailable: 1
  clickhouse:
    enabled: true
    minAvailable: 1
# Network policies for security, covering both ingress and egress traffic.
networkPolicy:
  enabled: true
  policyTypes:
    - Ingress
    - Egress
# Self-monitoring of SigNoz, with a ServiceMonitor on a 30s interval.
selfMonitoring:
  enabled: true
  serviceMonitor:
    enabled: true
    interval: 30s