# SigNoz Helm Chart Values - Production Environment
# High-availability configuration with resource optimization
#
# Official Chart: https://github.com/SigNoz/charts
# Install Command: helm install signoz signoz/signoz -n signoz --create-namespace -f signoz-values-prod.yaml

# Chart-wide defaults: the storage class name and the public domain name.
global:
  storageClass: "standard"
  domain: "monitoring.bakewise.ai"

# Frontend Configuration
# Two replicas, exposed through an nginx ingress under the /signoz path prefix.
frontend:
  replicaCount: 2
  image:
    repository: signoz/frontend
    # Quoted so the version is always read as a string.
    tag: "0.52.3"
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    port: 3301

  ingress:
    enabled: true
    className: nginx
    annotations:
      # $2 is the second capture group of the path regex below, i.e. the
      # request path with the /signoz prefix removed.
      nginx.ingress.kubernetes.io/rewrite-target: '/$2'
      nginx.ingress.kubernetes.io/use-regex: "true"
      cert-manager.io/cluster-issuer: "letsencrypt-prod"
      nginx.ingress.kubernetes.io/ssl-redirect: "true"
    hosts:
      - host: monitoring.bakewise.ai
        paths:
          - path: '/signoz(/|$)(.*)'
            pathType: ImplementationSpecific
    tls:
      - secretName: signoz-tls
        hosts:
          - monitoring.bakewise.ai

  resources:
    requests:
      cpu: 250m
      memory: 512Mi
    limits:
      cpu: 500m
      memory: 1Gi

  # Pod Anti-affinity for HA
  # Soft preference: try to place pods labelled app=signoz-frontend on
  # different nodes, but still schedule if that is not possible.
  affinity:
    podAntiAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 100
          podAffinityTerm:
            labelSelector:
              matchExpressions:
                - key: app
                  operator: In
                  values:
                    - signoz-frontend
            topologyKey: kubernetes.io/hostname

  env:
    - name: FRONTEND_REFRESH_INTERVAL
      value: "30000"

# Query Service Configuration
# Two replicas with a persistent volume and CPU/memory based autoscaling.
queryService:
  replicaCount: 2
  image:
    repository: signoz/query-service
    # Quoted so the version is always read as a string.
    tag: "0.52.3"
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    port: 8080

  resources:
    requests:
      cpu: 500m
      memory: 1Gi
    limits:
      cpu: 1000m
      memory: 2Gi

  # Pod Anti-affinity for HA
  # Soft preference: spread pods labelled app=signoz-query-service across nodes.
  affinity:
    podAntiAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 100
          podAffinityTerm:
            labelSelector:
              matchExpressions:
                - key: app
                  operator: In
                  values:
                    - signoz-query-service
            topologyKey: kubernetes.io/hostname

  # Environment values are quoted because container env vars must be strings.
  env:
    - name: DEPLOYMENT_TYPE
      value: "kubernetes-helm"
    # NOTE(review): this points at a local on-disk path; confirm the query
    # service can safely run more than one replica (replicaCount/autoscaling
    # below) against per-pod local storage.
    - name: SIGNOZ_LOCAL_DB_PATH
      value: "/var/lib/signoz"
    - name: RETENTION_DAYS
      value: "30"

  persistence:
    enabled: true
    size: 20Gi
    storageClass: "standard"

  # Horizontal Pod Autoscaler
  # minReplicas matches replicaCount above.
  autoscaling:
    enabled: true
    minReplicas: 2
    maxReplicas: 5
    targetCPUUtilizationPercentage: 70
    targetMemoryUtilizationPercentage: 80

# AlertManager Configuration
# Two replicas; alerts are routed by severity to email and Slack receivers.
alertmanager:
  replicaCount: 2
  image:
    repository: signoz/alertmanager
    # Quoted so the version is always read as a string.
    tag: "0.23.5"
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    port: 9093

  resources:
    requests:
      cpu: 250m
      memory: 512Mi
    limits:
      cpu: 500m
      memory: 1Gi

  # Pod Anti-affinity for HA
  # Soft preference: spread pods labelled app=signoz-alertmanager across nodes.
  affinity:
    podAntiAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 100
          podAffinityTerm:
            labelSelector:
              matchExpressions:
                - key: app
                  operator: In
                  values:
                    - signoz-alertmanager
            topologyKey: kubernetes.io/hostname

  persistence:
    enabled: true
    size: 5Gi
    storageClass: "standard"

  config:
    global:
      resolve_timeout: 5m
      smtp_smarthost: 'smtp.gmail.com:587'
      smtp_from: 'alerts@bakewise.ai'
      smtp_auth_username: 'alerts@bakewise.ai'
      # NOTE(review): Alertmanager does not expand environment variables in
      # its configuration file. Confirm that something upstream (chart
      # templating, envsubst in CI, ...) replaces this placeholder before the
      # config is loaded; otherwise the literal text is sent as the password.
      smtp_auth_password: '${SMTP_PASSWORD}'
      smtp_require_tls: true

    route:
      group_by: ['alertname', 'cluster', 'service', 'severity']
      group_wait: 10s
      group_interval: 10s
      repeat_interval: 12h
      # Default receiver for alerts that match none of the child routes.
      receiver: 'critical-alerts'
      routes:
        - match:
            severity: critical
          receiver: 'critical-alerts'
          # Keep evaluating the sibling routes after this one matches.
          continue: true
        - match:
            severity: warning
          receiver: 'warning-alerts'

    # NOTE(review): the '{{ ... }}' strings below are Alertmanager notification
    # templates; confirm the chart does not run this value through Helm `tpl`.
    receivers:
      - name: 'critical-alerts'
        email_configs:
          - to: 'critical-alerts@bakewise.ai'
            headers:
              Subject: '[CRITICAL] {{ .GroupLabels.alertname }} - Bakery IA'
        # Slack webhook for critical alerts
        # NOTE(review): same placeholder caveat as smtp_auth_password above.
        slack_configs:
          - api_url: '${SLACK_WEBHOOK_URL}'
            channel: '#alerts-critical'
            title: '[CRITICAL] {{ .GroupLabels.alertname }}'
            text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'

      - name: 'warning-alerts'
        email_configs:
          - to: 'oncall@bakewise.ai'
            headers:
              Subject: '[WARNING] {{ .GroupLabels.alertname }} - Bakery IA'

# ClickHouse Configuration - Time Series Database
# Two replicas, each forced onto a different node, with 100Gi volumes.
clickhouse:
  replicaCount: 2
  image:
    repository: clickhouse/clickhouse-server
    tag: "24.1.2-alpine"
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    httpPort: 8123
    tcpPort: 9000

  resources:
    requests:
      cpu: 1000m
      memory: 2Gi
    limits:
      cpu: 2000m
      memory: 4Gi

  # Pod Anti-affinity for HA
  # Hard requirement (unlike the other components): two pods labelled
  # app=signoz-clickhouse are never scheduled onto the same node.
  affinity:
    podAntiAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        - labelSelector:
            matchExpressions:
              - key: app
                operator: In
                values:
                  - signoz-clickhouse
          topologyKey: kubernetes.io/hostname

  persistence:
    enabled: true
    size: 100Gi
    storageClass: "standard"

  # ClickHouse configuration
  config:
    logger:
      level: information
    max_connections: 4096
    max_concurrent_queries: 500
    # MergeTree insert back-pressure thresholds (active part counts at which
    # inserts are slowed down / rejected). These do not configure data
    # retention; the 30-day prod retention has to be set elsewhere.
    merge_tree:
      parts_to_delay_insert: 150
      parts_to_throw_insert: 300
    # Performance tuning
    # Per-query memory cap, kept below the 4Gi container limit above so that
    # ClickHouse aborts an oversized query instead of the pod being OOM-killed
    # (the previous value of 10000000000 could never take effect first).
    max_memory_usage: 3000000000
    # Spill GROUP BY state to disk at about half of max_memory_usage; the
    # previous value (20000000000) was above the memory cap, so spilling
    # would never have been triggered.
    max_bytes_before_external_group_by: 1500000000

  # Backup configuration
  # Cron schedule "0 2 * * *" = every day at 02:00.
  # NOTE(review): the unit of `retention` (days vs. number of backups) is not
  # visible here - confirm against the chart.
  backup:
    enabled: true
    schedule: "0 2 * * *"
    retention: 7

# OpenTelemetry Collector - Integrated with SigNoz
# Receives OTLP (gRPC/HTTP) and scrapes its own metrics, then writes traces,
# metrics and logs to ClickHouse.
otelCollector:
  enabled: true
  replicaCount: 2
  image:
    repository: signoz/signoz-otel-collector
    # Quoted so the version is always read as a string.
    tag: "0.102.8"
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    # These ports mirror the endpoints configured under `config` below.
    ports:
      otlpGrpc: 4317
      otlpHttp: 4318
      metrics: 8888
      healthCheck: 13133

  resources:
    requests:
      cpu: 500m
      memory: 512Mi
    limits:
      cpu: 1000m
      memory: 1Gi

  # Full OTEL Collector Configuration
  config:
    extensions:
      health_check:
        endpoint: "0.0.0.0:13133"
      zpages:
        endpoint: "0.0.0.0:55679"

    receivers:
      otlp:
        protocols:
          grpc:
            endpoint: "0.0.0.0:4317"
            max_recv_msg_size_mib: 16
          http:
            endpoint: "0.0.0.0:4318"
            cors:
              allowed_origins:
                - "https://monitoring.bakewise.ai"
                - "https://*.bakewise.ai"

      # Prometheus receiver for scraping metrics
      # Scrapes the collector's own metrics port (8888, see service.ports).
      prometheus:
        config:
          scrape_configs:
            - job_name: 'otel-collector'
              scrape_interval: 30s
              static_configs:
                - targets: ['localhost:8888']

    processors:
      batch:
        timeout: 10s
        send_batch_size: 2048
        send_batch_max_size: 4096

      # limit_mib sits below the 1Gi container memory limit set above.
      memory_limiter:
        check_interval: 1s
        limit_mib: 800
        spike_limit_mib: 200

      # Resource detection for K8s
      # NOTE(review): the `docker` detector presumably needs access to a
      # Docker daemon, which pods normally do not have - confirm it is wanted.
      resourcedetection:
        detectors: [env, system, docker]
        timeout: 5s

      # Add resource attributes
      resource:
        attributes:
          - key: deployment.environment
            value: production
            action: upsert
          - key: cluster.name
            value: bakery-ia-prod
            action: upsert

    exporters:
      # Export to SigNoz ClickHouse
      # Each exporter names its connection-string key differently
      # (datasource / endpoint / dsn); all three use ClickHouse TCP port 9000.
      clickhousetraces:
        datasource: "tcp://clickhouse:9000/?database=signoz_traces"
        timeout: 10s
        retry_on_failure:
          enabled: true
          initial_interval: 5s
          max_interval: 30s
          max_elapsed_time: 300s

      clickhousemetricswrite:
        endpoint: "tcp://clickhouse:9000/?database=signoz_metrics"
        timeout: 10s
        retry_on_failure:
          enabled: true
          initial_interval: 5s
          max_interval: 30s
          max_elapsed_time: 300s

      clickhouselogsexporter:
        dsn: "tcp://clickhouse:9000/?database=signoz_logs"
        timeout: 10s
        retry_on_failure:
          enabled: true
          initial_interval: 5s
          max_interval: 30s
          max_elapsed_time: 300s

      # Minimal logging for prod
      # NOTE(review): upstream has deprecated the `logging` exporter in favor
      # of `debug`; confirm this collector image still accepts it and the
      # `loglevel` key before upgrading.
      logging:
        loglevel: warn
        sampling_initial: 2
        sampling_thereafter: 500

    service:
      extensions: [health_check, zpages]
      # Processors run in the order listed; memory_limiter comes first in
      # every pipeline.
      pipelines:
        traces:
          receivers: [otlp]
          processors: [memory_limiter, batch, resourcedetection, resource]
          exporters: [clickhousetraces, logging]

        metrics:
          receivers: [otlp, prometheus]
          processors: [memory_limiter, batch, resourcedetection, resource]
          exporters: [clickhousemetricswrite]

        logs:
          receivers: [otlp]
          processors: [memory_limiter, batch, resourcedetection, resource]
          exporters: [clickhouselogsexporter, logging]

# OpenTelemetry Collector Deployment Mode
otelCollectorDeployment:
  enabled: true
  mode: deployment

  # HPA for OTEL Collector
  # Scales between 2 and 10 replicas on CPU (70%) and memory (80%) targets.
  autoscaling:
    enabled: true
    minReplicas: 2
    maxReplicas: 10
    targetCPUUtilizationPercentage: 70
    targetMemoryUtilizationPercentage: 80

# Node Exporter for infrastructure metrics
nodeExporter:
  enabled: true
  service:
    type: ClusterIP
    port: 9100

  resources:
    requests:
      cpu: 100m
      memory: 128Mi
    limits:
      cpu: 200m
      memory: 256Mi

# Schemamanager - Manages ClickHouse schema
schemamanager:
  enabled: true
  image:
    repository: signoz/signoz-schema-migrator
    # Quoted so the version is always read as a string.
    tag: "0.52.3"
    pullPolicy: IfNotPresent

# Additional Configuration
# Create a service account named "signoz" with no extra annotations.
serviceAccount:
  create: true
  annotations: {}
  name: "signoz"

# Security Context
# Run as non-root UID 1000; fsGroup 1000 is applied for volume ownership.
securityContext:
  runAsNonRoot: true
  runAsUser: 1000
  fsGroup: 1000

# Pod Disruption Budgets for HA
# Each listed component keeps at least one pod available during voluntary
# disruptions.
podDisruptionBudget:
  frontend:
    enabled: true
    minAvailable: 1
  queryService:
    enabled: true
    minAvailable: 1
  alertmanager:
    enabled: true
    minAvailable: 1
  clickhouse:
    enabled: true
    minAvailable: 1

# Network Policies for security
# Policies cover both directions of traffic.
networkPolicy:
  enabled: true
  policyTypes:
    - Ingress
    - Egress

# Monitoring SigNoz itself
# Enables a ServiceMonitor with a 30s scrape interval.
selfMonitoring:
  enabled: true
  serviceMonitor:
    enabled: true
    interval: 30s